Models for white mold

With the goals of interpretation and prediction

Author

Denis Shah

Published

Invalid Date

Packages

library(iml)           # for model-agnostic interpretation; Ver. 0.11.3
library(tidyverse)     # general wrangling; Ver. 2.0.0
library(rsample)       # for bootstrap resampling; Ver. 1.2.1
library(labelled)      # for general functions to work with labelled data; Ver. 2.13.0
library(gtsummary)     # automatic use of variable labels in summary tables; Ver. 2.0.3
library(kableExtra)    # nice table output in html; Ver. 1.4.0

library(ranger)        # random forest model fitting; Ver. 0.16.0
library(tuneRanger)    # for tuning a ranger model; Ver. 0.7
library(mlr)           # an mlr task is required by tuneRanger; Ver. 2.19.2
library(cutpointr)     # Cutpoint determination for classification; Ver. 1.2.0
library(furrr)         # For parallel processing; Ver. 0.3.1
library(bayestestR)    # For calculating HDI; Ver. 0.15.0
library(pROC)          # to get the C-statistic; Ver. 1.18.5
library(vip)           # Variable importance measures; Ver. 0.4.1
library(pdp)           # Partial dependence plots; Ver. 0.8.1

library(VSURF)         # for RF-based predictor selection; Ver. 1.2.0

library(predtools)     # Calibration plot; Ver. 0.0.3
library(patchwork)     # Combining plots; Ver. 1.3.0
library(lattice)       # Wireframe plots for 2-way pdp; Ver. 0.22-6

library(kernelshap)    # Getting SHAP values; Ver. 0.7.0
library(shapviz)       # Visualize said SHAP values; Ver. 0.9.6
library(tictoc)        # How long stuff takes; Ver.  1.2.1


library(rms)               # Fit logistic regression models; Ver. 6.8-2
library(CalibrationCurves) # Plot calibration curves; Ver.2.0.3
library(R.devices)     # suppressGraphics function to suppress the automatic outputting of a plot     
make_kable <- function(...) {
  # Shared kable + kableExtra styling so every table in this document is
  # rendered consistently without repeating the styling calls.
  # See: https://stackoverflow.com/questions/73718600/option-to-specify-default-kableextra-styling-in-rmarkdown
  styled_tbl <- kable(..., format = "html", row.names = TRUE, align = 'l')
  kable_styling(styled_tbl,
                bootstrap_options = c("striped"),
                position = "left",
                font_size = 11,
                full_width = FALSE)
}
load(here::here("DataFusion", "FusedData.RData"))  # X

Describing the data

# A good Intro to why variable labels may help is here:
# https://www.pipinghotdata.com/posts/2022-09-13-the-case-for-variable-labels-in-r/

# The variable names are not very descriptive (you yourself would have a hard time remembering what they encode or the units):
# names(X)

# Assign variable labels:
# dap = days after planting date
# vsw = volumetric soil water (0-7cm)

wm_metadata <- tribble(
    ~variable,           ~variable_label, 
    "subject",              "Snap bean field",
    "wm",                   "White mold presence",
    "drainage",             "Soil drainage class",
    "hydrol",               "Soil hydrological group",
    "cd",                   "Climate division",
    "harv.optim",           "Field harvested <=60 dap",
    "ph",                   "Soil pH",
    "om",                   "Soil organic matter content (%)",
    "log_sand_clay",        "Logratio sand:clay",
    "log_silt_clay",        "Logratio silt:clay",
    "cc35",                 "Canopy gap (cm)\nat 35 dap", 
    "rainto35dap",          "Total rain (mm)\nfrom planting to 35 dap",
    "rain36to50dap",        "Total rain (mm)\nfrom 36 to 50 dap", 
    "t2m_mean_to_4dap",     "Mean air temperature (\u00B0C)\nfrom planting to 4 dap",
    "sm_4dbp_to_3dap",      "Mean vsw (m³/m³)\nfrom 4 days before planting to 3 dap",
    "sm_5dap_to_15dap",     "Mean vsw (m³/m³)\nfrom 5 to 15 dap", 
    "sm_17dap_to_24dap",    "Mean vsw (m³/m³)\nfrom 17 to 24 dap",
    "sm_40dap_to_49dap",    "Mean vsw (m³/m³)\nfrom 40 to 49 dap", 
    "stsm_35dap_to_44dap",  "Mean soil temperature (\u00B0C):vsw (m³/m³)\nratio from 35 to 44 dap"
)

# To quickly assign the variable labels, first create a named vector via deframe() with values as the variable labels and names as the variable names.
wm_labels <- 
  wm_metadata |> 
  tibble::deframe()

# Now assign the labels using the splice operator. Using the splice operator, labels are assigned via matching against the variable name, which means that variable order does not matter.
X_labelled <- 
  X |> 
  labelled::set_variable_labels(!!!wm_labels)

# You can also assign the labels directly:
# X_labelled <- 
#   X |> 
#   set_variable_labels(
#     subject               = "Snap bean field",
#     wm                    = "White mold presence",
#     drainage              = "Soil drainage class",
#     hydrol                = "Soil hydrological group",
#     cd                    = "Climate division",
#     harv.optim            = "Field harvested <=60 dap",
#     ph                    = "Soil pH",
#     om                    = "Soil organic matter content (%)",
#     log_sand_clay         = "Logratio sand:clay",
#     log_silt_clay         = "Logratio silt:clay",
#     cc35                  = "Canopy gap (cm) at 35 dap", 
#     rainto35dap           = "Total rain (mm) from planting to 35 dap",
#     rain36to50dap         = "Total rain (mm) from 36 to 50 dap", 
#     t2m_mean_to_4dap      = "Mean air temperature (C) from planting to 4 dap",
#     sm_4dbp_to_3dap       = "Mean vsw from 4 days before planting to 3 dap",
#     sm_5dap_to_15dap      = "Mean vsw from 5 to 15 dap", 
#     sm_17dap_to_24dap     = "Mean vsw from 17 to 24 dap",
#     sm_40dap_to_49dap     = "Mean vsw from 40 to 49 dap", 
#     stsm_35dap_to_44dap   = "Mean soil temperature:vsw ratio from 35 to 44 dap"
#   )

Data dictionary

X_dictionary <- X_labelled |> 
  generate_dictionary()

X_dictionary |> 
  make_kable()
pos variable label col_type missing levels value_labels
1 1 subject Snap bean field dbl 0 NULL NULL
2 2 wm White mold presence dbl 0 NULL NULL
3 3 drainage Soil drainage class fct 0 Well_Drained , Poorly_Drained NULL
4 4 hydrol Soil hydrological group fct 0 A, B, C, D NULL
5 5 cd Climate division fct 0 Central Lakes, Great Lakes NULL
6 6 harv.optim Field harvested <=60 dap fct 0 Yes, No NULL
7 7 ph Soil pH dbl 0 NULL NULL
8 8 om Soil organic matter content (%) dbl 0 NULL NULL
9 9 log_sand_clay Logratio sand:clay dbl 0 NULL NULL
10 10 log_silt_clay Logratio silt:clay dbl 0 NULL NULL
11 11 cc35 Canopy gap (cm) at 35 dap dbl 0 NULL NULL
12 12 rainto35dap Total rain (mm) from planting to 35 dap dbl 0 NULL NULL
13 13 rain36to50dap Total rain (mm) from 36 to 50 dap dbl 0 NULL NULL
14 14 t2m_mean_to_4dap Mean air temperature (°C) from planting to 4 dap dbl 0 NULL NULL
15 15 sm_4dbp_to_3dap Mean vsw (m³/m³) from 4 days before planting to 3 dap dbl 0 NULL NULL
16 16 sm_5dap_to_15dap Mean vsw (m³/m³) from 5 to 15 dap dbl 0 NULL NULL
17 17 sm_17dap_to_24dap Mean vsw (m³/m³) from 17 to 24 dap dbl 0 NULL NULL
18 18 sm_40dap_to_49dap Mean vsw (m³/m³) from 40 to 49 dap dbl 0 NULL NULL
19 19 stsm_35dap_to_44dap Mean soil temperature (°C):vsw (m³/m³) ratio from 35 to 44 dap dbl 0 NULL NULL

The variables

X_labelled |> 
  dplyr::select(-subject) |>
  tbl_summary(
    by = wm
  ) |> 
  bold_labels()
Characteristic 0
N = 283¹
1
N = 73¹
Soil drainage class

    Well_Drained 210 (74%) 56 (77%)
    Poorly_Drained 73 (26%) 17 (23%)
Soil hydrological group

    A 40 (14%) 14 (19%)
    B 9 (3.2%) 6 (8.2%)
    C 51 (18%) 5 (6.8%)
    D 183 (65%) 48 (66%)
Climate division

    Central Lakes 71 (25%) 22 (30%)
    Great Lakes 212 (75%) 51 (70%)
Field harvested <=60 dap 221 (78%) 49 (67%)
Soil pH 6.01 (5.86, 6.12) 6.01 (5.86, 6.13)
Soil organic matter content (%) 0.52 (0.49, 0.55) 0.53 (0.50, 0.55)
Logratio sand:clay 0.75 (0.27, 1.00) 0.70 (0.33, 0.96)
Logratio silt:clay 1.10 (1.03, 1.17) 1.11 (1.02, 1.23)
Canopy gap (cm) at 35 dap 51 (44, 56) 49 (44, 52)
Total rain (mm) from planting to 35 dap 110 (80, 160) 146 (89, 163)
Total rain (mm) from 36 to 50 dap 41 (23, 76) 60 (41, 80)
Mean air temperature (°C) from planting to 4 dap 20.10 (17.73, 21.24) 21.40 (20.06, 22.81)
Mean vsw (m³/m³) from 4 days before planting to 3 dap 0.30 (0.26, 0.33) 0.33 (0.29, 0.35)
Mean vsw (m³/m³) from 5 to 15 dap 0.30 (0.26, 0.35) 0.33 (0.29, 0.36)
Mean vsw (m³/m³) from 17 to 24 dap 0.30 (0.25, 0.33) 0.30 (0.28, 0.36)
Mean vsw (m³/m³) from 40 to 49 dap 0.29 (0.24, 0.36) 0.33 (0.30, 0.37)
Mean soil temperature (°C):vsw (m³/m³) ratio from 35 to 44 dap 68 (58, 84) 61 (55, 67)
1 n (%); Median (Q1, Q3)

Export the Table to a Word file to use in the manuscript

X_labelled |> 
  dplyr::select(-subject) |>
  tbl_summary(
    by = wm
  ) |> 
  bold_labels() |> 
  as_gt() |> 
  gt::gtsave(filename = here::here("Modeling", "foo.docx"))

RF model fitting

Fit a RF model to the set of 16 available predictor variables.

First, tune the model on the hyperparameters:

  • mtry
  • min.node.size
  • sample.fraction
X.df <- 
  X %>% 
  # Remove the subject column:
  # Also drainage, as prelim fitting showed it had NEGATIVE importance! Negative importance can happen, and indicates the variable is a worse predictor than random permutation.
  dplyr::select(-subject, -drainage) %>% 
  # Set the response as a factor. Level order c("0", "1") puts "0" first, so
  # the positive class is stated explicitly when the mlr task is built below.
  dplyr::mutate(wm = factor(wm, levels = c("0", "1"))) %>% 
  # mlr expects a plain data frame (not a tibble):
  as.data.frame()
# Set seed for reproducibility:
set.seed(14092)

# For tuneRanger, a mlr task has to be created.
# With mlr, the target variable doesn't have to be a factor, but it's recommended for clarity.
# If you don't specify the positive argument, mlr will use the first factor level of the target variable as the positive class.
cc.task <- mlr::makeClassifTask(data = X.df, target = "wm", positive = "1")


## with tuneRanger (following Probst et al and the documentation)
# Rough estimation of the Tuning time ~ 2 min
estimateTimeTuneRanger(cc.task)

# Tuning process:
tictoc::tic()
rf.tuned <- tuneRanger(cc.task, num.trees = 1000, show.info = FALSE)
tictoc::toc()  # 48 sec

# Save the fitted model and associated dataframe so that you don't have to re-tune:
save(rf.tuned, X.df, file = here::here("Modeling", "rf_tuned.RData"))

Examine the fitted model

# Load the fitted model and associated dataframe:
load(here::here("Modeling", "rf_tuned.RData"))  # rf.tuned, X.df

# Model with the new tuned hyperparameters
# rf.tuned$model
# Predicted probabilities directly from the tuned model.

# Prediction (a data frame with two columns: truth, response)
# where truth = 0, 1 for the observed wm status
# response = the fitted Prob(wm = 1)
# NOTE: here truth is a factor
pred <-
  predict(rf.tuned$model, newdata = X.df)$data %>%
  dplyr::select(truth, prob.1) %>%
  dplyr::rename(response = prob.1)

# Check:
head(pred)

The tuned hyperparameters

# Collect the tuned hyperparameter values in a named list, both for display
# here and for re-fitting the model later with ranger directly (see `m1`).
rf.tuned.pars <-
  list(
    mtry = rf.tuned$recommended.pars$mtry,
    min.node.size = rf.tuned$recommended.pars$min.node.size,
    sample.fraction = rf.tuned$recommended.pars$sample.fraction,
    # num.trees was fixed (not tuned); recover it from the learner settings:
    num.trees = rf.tuned$model$learner$par.vals$num.trees
  ) 

rf.tuned.pars %>%
  as_tibble() %>%
  make_kable()
mtry min.node.size sample.fraction num.trees
1 2 9 0.844 1000

Estimating optimism

# Function to calculate Youden Index, Sensitivity, and Specificity using the `cutpointr` package:
# Classification performance at the Youden-optimal probability cutpoint.
# pred : data frame with columns
#        truth    - observed class ("1" = white mold present, "0" = absent)
#        response - fitted Prob(wm = 1)
# Returns a named list:
#   YI    - Youden index (Se + Sp - 1) at the optimal cutpoint
#   Se/Sp - sensitivity and specificity at that cutpoint
#   AUC   - area under the ROC curve
#   brier - mean squared difference between fitted prob and the 0/1 outcome
calc_metrics_cutpointr <- function(pred) {
  # direction = ">=": higher fitted probabilities indicate the positive class.
  opt_cut <- cutpointr::cutpointr(
    data = pred,
    x = response,
    class = truth,
    direction = ">=",
    pos_class = 1,
    neg_class = 0,
    method = maximize_metric,
    metric = youden
  )
  
  list(
    YI = opt_cut$youden,
    Se = as.numeric(opt_cut$sensitivity),
    Sp = as.numeric(opt_cut$specificity),
    AUC = opt_cut$AUC,
    # Brier score computed directly from pred (not from cutpointr output):
    brier = mean((pred$response - ifelse(pred$truth == "1", 1, 0))^2)
  )
}

# 1. Prepare predictions dataframe:
# 1. Prepare predictions dataframe
#    (truth = observed wm status; response = fitted Prob(wm = 1)):
pred <-
  predict(rf.tuned$model, newdata = X.df)$data %>%
  dplyr::select(truth, prob.1) %>%
  dplyr::rename(response = prob.1)

# 2. Calculate apparent performance (resubstitution estimates; optimistic)
apparent_perf <- calc_metrics_cutpointr(pred)

# 3. Bootstrap process
# One Harrell-style optimism bootstrap iteration: re-tune and fit on a
# stratified bootstrap resample, score on the resample (train) and on the
# original data (test); optimism = train - test for every metric.
#
# i         : iteration index (used only for progress/error reporting).
# data      : modelling data frame with factor response `wm`
#             (defaults to the global X.df, preserving the original call
#             signature bootstrap_fxn(i)).
# num_trees : number of trees passed to tuneRanger (1000 matched the
#             original full-predictor fit).
#
# Returns a tibble (iteration, metric, train, test, optimism), or NULL if
# model fitting failed for this resample (the caller's list_rbind() drops
# NULLs).
bootstrap_fxn <- function(i, data = X.df, num_trees = 1000) {
  # Print the current iteration
  cat("Currently processing iteration: ", i, "\n")
  
  # a. Generate bootstrap sample (stratified on wm: classes are imbalanced)
  boot_sample <- 
    rsample::bootstraps(data, times = 1, strata = wm)$splits[[1]] %>%
    rsample::analysis()
  
  # b. Create classification task and fit model on the bootstrap sample.
  #    tuneRanger occasionally fails on a resample, so trap errors and drop
  #    the iteration rather than kill the whole future_map().
  cc.task <- mlr::makeClassifTask(data = boot_sample, target = "wm", positive = "1")
  
  boot_model <- tryCatch(
    expr = {
      tuneRanger::tuneRanger(cc.task, num.trees = num_trees, show.info = FALSE)$model
    },
    error = function(e) {
      cat("Error occurred in iteration", i, ":", e$message, "\n")
      NULL  # signal failure to the caller
    }
  )
  
  # If boot_model is NULL, skip this iteration
  if (is.null(boot_model)) {
    cat("Skipping iteration", i, "due to error.\n")
    return(NULL)
  }
  
  # Helper: predicted Prob(wm = 1) plus truth, in the layout that
  # calc_metrics_cutpointr() expects.
  get_pred <- function(model, newdata) {
    predict(model, newdata = newdata)$data %>%
      dplyr::select(truth, prob.1) %>%
      dplyr::rename(response = prob.1)
  }
  
  # c. Metrics on the bootstrap sample itself (training performance)
  train_perf <- calc_metrics_cutpointr(get_pred(boot_model, boot_sample))
  
  # d & e. Apply bootstrap model to the original dataset (test performance)
  test_perf <- calc_metrics_cutpointr(get_pred(boot_model, data))
  
  # f. Optimism per metric. NOTE: train - test for EVERY metric, so for
  #    smaller-is-better metrics (Brier) optimism is negative and the
  #    downstream correction (apparent - optimism) still moves the estimate
  #    in the pessimistic direction.
  optimism <- purrr::map2_dbl(train_perf, test_perf, \(x, y) x - y)
  
  tibble(
    iteration = i,
    metric = names(train_perf),
    train = unlist(train_perf),
    test = unlist(test_perf),
    optimism = optimism
  )
}

set.seed(123)
# Run bootstrap_fxn in parallel.
# (my.seeds only indexes the 1000 iterations; reproducible parallel RNG
#  streams are handled by furrr_options(seed = TRUE).)
my.seeds <- c(1:1000)

# Nested future topology: 5 multisession workers for the map, sequential
# within each worker.
plan(list(tweak(multisession, workers = 5), sequential))
tictoc::tic()
bootstrap_results <- 
  furrr::future_map(my.seeds, bootstrap_fxn, .options = furrr_options(seed = TRUE)) %>% 
  # list_rbind() silently drops the NULLs returned by failed iterations:
  purrr::list_rbind()
tictoc::toc()  # 11404.61 sec ~ 3.17 hr

plan(sequential)

# 4. Apparent performance bootstrap HDI
# 4. Apparent performance bootstrap HDI
#    (95% highest-density interval of the bootstrap TRAINING performance,
#     used as an uncertainty band around the apparent estimates)
apparent_CI <-
  bootstrap_results %>% 
  dplyr::group_by(metric) %>% 
  dplyr::summarize(app_LCI = bayestestR::hdi(train, ci = 0.95)$CI_low, 
                   app_UCI = bayestestR::hdi(train, ci = 0.95)$CI_high)

# 5. Average optimism across all bootstrap samples
avg_optimism <- 
  bootstrap_results %>%
  dplyr::group_by(metric) %>%
  dplyr::summarize(avg_optimism = mean(optimism))

# 6. Calculate optimism-corrected performance:
#    corrected = apparent - mean optimism (Harrell's optimism bootstrap).
#    For Brier (smaller is better) the mean optimism is negative, so the
#    correction still makes the estimate more pessimistic.
corrected_perf <- 
  tibble(
    metric = names(apparent_perf),
    apparent = unlist(apparent_perf)
  ) %>%
  dplyr::left_join(apparent_CI, by = "metric") %>%
  dplyr::left_join(avg_optimism, by = "metric") %>%
  dplyr::mutate(corrected = apparent - avg_optimism)


# Save the results so that you don't have to ever, ever repeat this time-consuming process:
save(pred, apparent_perf, bootstrap_results, corrected_perf, file = here::here("Modeling", "RFoptimism.RData"))
# Load the model-fitted probs and the optimism estimates:
# pred, apparent_perf, bootstrap_results, corrected_perf
load(here::here("Modeling", "RFoptimism.RData"))  

corrected_perf %>% 
  make_kable()
metric apparent app_LCI app_UCI avg_optimism corrected
1 YI 0.968 0.975 1.00 0.196 0.772
2 Se 1.000 0.986 1.00 0.121 0.879
3 Sp 0.968 0.979 1.00 0.075 0.893
4 AUC 0.997 0.999 1.00 0.046 0.951
5 brier 0.039 0.001 0.01 -0.052 0.090

C-statistic

# Get the C-statistic (AUROC) from the fitted probabilities in `pred`.
# direction = "<" declares that controls (wm = 0) carry LOWER predicted
# probabilities than cases (wm = 1). Without it, pROC auto-selects whichever
# direction yields AUC >= 0.5, which can silently flip the curve and
# overstate discrimination.
roc_obj <- pROC::roc(pred$truth, pred$response, levels = c(0, 1), direction = "<")

# 95% CI (DeLong, the pROC default) for the apparent C-statistic:
ci_obj <- pROC::ci.auc(roc_obj)


# One-row table: apparent estimate + 95% CI, then the optimism-corrected
# (internal) estimate. No CI is available for the corrected value, hence NA.
res_C <- matrix(
  c(ci_obj[2],   # apparent AUC (middle element of the ci.auc vector)
    ci_obj[1],   # lower .95
    ci_obj[3],   # upper .95
    corrected_perf %>% dplyr::filter(metric == "AUC") %>% dplyr::pull(corrected) %>% unname(),
    NA,
    NA),
  
  nrow = 1,
  ncol = 6,
  byrow = TRUE,
  
  dimnames = list(
    c("C-statistic"),
    rep(c("Estimate", "Lower .95 ", "Upper .95"), 2)
  )
)

res_C %>% 
  make_kable() %>% 
  add_header_above(c(" " = 1, 
                     "Apparent" = 3, 
                     "Internal" = 3))
Apparent
Internal
Estimate Lower .95 Upper .95 Estimate Lower .95 Upper .95
C-statistic 0.997 0.995 1 0.951 NA NA

Brier score

# Brier score:
tibble::tibble(`Brier (apparent)` = corrected_perf %>% dplyr::filter(metric == "brier") %>% dplyr::pull(apparent) %>% unname(), 
               `Brier (internal)` = corrected_perf %>% dplyr::filter(metric == "brier") %>% dplyr::pull(corrected) %>% unname()) %>% 
  make_kable()
Brier (apparent) Brier (internal)
1 0.039 0.09

Calibration

Calibration plot

# Assuming 'pred' is your data frame with 'truth' and 'response' columns
# For the function to work, you need to represent truth as a numeric
RF1CC <- CalibrationCurves::valProbggplot(pred$response, ifelse(pred$truth == "1", 1, 0))$ggPlot
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
RF1CC

Calibration plot for a tuned random forest model fit to 16 predictors.

Histogram of predicted probabilities

pred %>%
  ggplot(aes(response)) +
  # geom_histogram(col = "white", bins = 20) +
  geom_histogram(fill = "orange", col = "orange3", bins = 20) +
  facet_wrap(~ as.factor(truth), ncol = 1, 
             labeller = as_labeller(c("0" = "No White Mold", "1" = "White Mold Present"))) +
  geom_rug(col = "blue", alpha = 0.5) + 
  labs(x = "Probability estimate of white mold", y = "No. of fields") +
  theme_bw()

Histogram of fitted probabilities for a tuned random forest model utilizing all 16 predictors.

ICI

Apparent
# As a background check:
# Get the apparent estimate using CalibrationCurves, which generates a flexible calibration curve based on loess as the default.
# (suppressGraphics keeps val.prob.ci.2 from emitting its plot here.)
RFcalPerf <- suppressGraphics({
  CalibrationCurves::val.prob.ci.2(pred$response, ifelse(pred$truth == "1", 1, 0))
})

# Eavg from CalibrationCurves should be close to the hand-computed ICI below.
RFcalPerf$stats["Eavg"]
# Calibration measures ICI, E50, E90.
# NOTE(review): the original comment said "based on secondary logistic
# regression", but the code below fits a loess smoother — confirm wording.
# Create a copy of the X.df data frame, with pred added as a column:
Xmod <-
  X.df %>%
  # For the loess, need wm as a numeric (factor "0"/"1" -> 0/1):
  dplyr::mutate(wm = as.numeric(wm)-1) %>% 
  dplyr::mutate(pred = pred$response)

# Use loess to fit the flexible calibration curve:
fit_cal <- loess(wm ~ pred, 
                 data = Xmod, 
                 span = 0.75, 
                 degree = 2, 
                 family = "gaussian")

# Smoothed "observed" event proportion at each fitted probability:
cal_obs <- predict(fit_cal)

dt_cal <- 
  cbind.data.frame(
    "obs" = cal_obs,
    "pred" = pred$response)


# ICI = mean |obs - pred|; E50/E90 = median and 90th percentile of |obs - pred|.
res_calmeas <-
  c(
    "ICI" = mean(abs(dt_cal$obs - dt_cal$pred)),
    "E50" = median(abs(dt_cal$obs - dt_cal$pred)),
    "E90" = unname(quantile(abs(dt_cal$obs - dt_cal$pred), 
                            probs = .90))
)


## Bootstrap confidence intervals for the calibration  measures (ICI, E50, E90)
alpha <- .05
# Set B = 2000 although it takes more time:
B <- 2000 
set.seed(2022)
# Using stratification on wm because of the imbalance between the classes:
vboot <- rsample::bootstraps(X.df, times = B, strata = wm)

# Bootstrap calibration measures
# Bootstrap replicate of the calibration measures (ICI, E50, E90).
# split : an rsample split; only the analysis (bootstrap) portion is used.
# Returns a one-row data.frame with columns ICI, E50, E90.
# Fixes over the original: the resample is materialized once (analysis() was
# called twice); the RF predictions become a column of the modelling data so
# the loess formula no longer reaches into the enclosing environment; and the
# result is returned as the final expression rather than via an assignment
# (assignments return invisibly).
numsum_boot <- function(split) {
  
  boot_data <- rsample::analysis(split)
  
  # Predicted Prob(wm = 1) from the tuned RF for the resampled fields:
  boot_data$pred <- predict(rf.tuned$model, newdata = boot_data)$data %>%
    dplyr::pull(prob.1)
  
  # loess needs the response as numeric 0/1:
  boot_data <- boot_data %>% dplyr::mutate(wm = as.numeric(wm) - 1)
  
  # Flexible calibration curve (same settings as the apparent estimate):
  v_cal <- loess(wm ~ pred, 
                 data = boot_data, 
                 span = 0.75, 
                 degree = 2, 
                 family = "gaussian")
  
  # |smoothed observed proportion - predicted probability| at the
  # training points:
  absdiff_boot <- abs(predict(v_cal) - boot_data$pred)
  
  data.frame(
    "ICI" = mean(absdiff_boot),
    "E50" = quantile(absdiff_boot, probs = .5),
    "E90" = quantile(absdiff_boot, probs = .9)
  )
}  # end of function

# Example of use:
# (bar <- purrr::pluck(vboot, "splits", 1) %>% numsum_boot())


tictoc::tic()
numsum_b <- 
  vboot %>% 
  dplyr::mutate(num_cal_boot = purrr::map(splits, numsum_boot),
                
                ICI = purrr::map_dbl(num_cal_boot, ~ .x$ICI),
         
                E50 = purrr::map_dbl(num_cal_boot, ~ .x$E50),
         
                E90 = purrr::map_dbl(num_cal_boot, ~ .x$E90)
         )
tictoc::toc()  # ~ 74 sec
46.36 sec elapsed
# numsum_b is a tibble with 2,000 rows
# Now we need to process the results to get the bootstrap CIs:
# Percentile bootstrap CIs for each calibration measure.
# Row layout: apparent estimate, then the alpha/2 and 1 - alpha/2 bootstrap
# quantiles, repeated for ICI, E50 and E90.
alpha <- .05
res_numcal <- matrix(c(res_calmeas["ICI"],
                       quantile(numsum_b$ICI, probs = alpha / 2),
                       quantile(numsum_b$ICI, 
                                probs = 1 - alpha / 2),
                       
                       res_calmeas["E50"],
                       quantile(numsum_b$E50, probs = alpha / 2),
                       quantile(numsum_b$E50, probs = 1 - alpha / 2),
                       
                       res_calmeas["E90"],
                       quantile(numsum_b$E90, probs = alpha / 2),
                       quantile(numsum_b$E90, probs = 1 - alpha / 2)
                       ),
                     nrow = 1,
                     ncol = 9,
                     # TRUE spelled out (T is reassignable):
                     byrow = TRUE,
                     dimnames = list(
                       c("Apparent"),
                       rep(
                         c("Estimate", "Lower.95", "Upper.95"),
                         3))
)


# Present the results:
res_numcal %>% 
  make_kable()  %>% 
  add_header_above(c(" " = 1,  
                     "ICI" = 3, 
                     "E50" = 3,
                     "E90" = 3))
ICI
E50
E90
Estimate Lower.95 Upper.95 Estimate Lower.95 Upper.95 Estimate Lower.95 Upper.95
Apparent 0.102 0.093 0.111 0.077 0.065 0.086 0.255 0.218 0.291

Model-agnostic interpretation

Notes

When using model agnostic procedures, additional code preparation is often required. The iml (Molnar 2019) package uses purely model agnostic procedures. Consequently, we need to create a model agnostic object that contains three components:

  1. A data frame with just the features (must be of class “data.frame”).

  2. A vector with the actual responses (must be numeric—0/1 for binary classification problems).

  3. A custom function that will take the features from (1), apply the ML algorithm, and return the predicted values as a vector.

# We need `X.df` from the chunk `data-setup-for-ranger-tuning`. 

# Refit the model (with the tuned hyperparameters) using the ranger package
# directly — iml works more smoothly with a native ranger object than with
# the mlr wrapper.
# Reminder: the tuned parameter settings:
# rf.tuned.pars

set.seed(14092)
m1 <- ranger(wm ~ ., 
             data = X.df, 
             importance = 'permutation',
             scale.permutation.importance = TRUE,
             # Set the hyperparameters (tuned earlier by tuneRanger):
             num.trees = rf.tuned.pars$num.trees,
             mtry = rf.tuned.pars$mtry,
             min.node.size = rf.tuned.pars$min.node.size, 
             sample.fraction = rf.tuned.pars$sample.fraction,
             probability = TRUE)
# 1) create a data frame with just the features
features <- X.df %>% dplyr::select(-wm)

# 2) Create a vector with the actual responses (numeric 0/1, as iml requires)
response <-  X.df %>% dplyr::mutate(wm = ifelse(wm == "1", 1, 0)) %>% dplyr::pull(wm)

# 3) Custom predict function returning Prob(wm = "1") as a plain vector.
# The probability column is selected by LEVEL NAME rather than by position:
# ranger names the columns of $predictions after the factor levels, so
# [, "1"] keeps working even if the level order ever changes, whereas the
# original [, 2] would silently return the wrong class probability.
pred.fxn <- function(object, newdata) {
  predict(object, data = newdata, type = "response")$predictions[, "1"]
}

# Example of prediction output:
# pred.fxn(m1, X.df) %>% head()

# Once we have these three components we can create our model agnostic objects for the iml package, which will just pass these downstream components (along with the ML model) to other functions.
# The iml Predictor bundles model + features + response + predict function;
# downstream iml functions (FeatureImp, FeatureEffect, ...) consume it.
components_iml <- Predictor$new(
  model = m1, 
  data = features, 
  y = response, 
  predict.fun = pred.fxn
)

Variable importance

The variable importance plot shows that not all predictors may be necessary.

# You can get an importance measure from ranger:
# ranger::importance(m1)

# List of available metrics in `vip` for binary classification:
# list_metrics() %>% 
#   dplyr::filter(!task == "Regression") %>% 
#   make_kable()


# Permutation-based variable importance via vip::vi_permute(), with log loss
# as the performance metric and 100 permutations per predictor.
tictoc::tic()
vip_train <- X.df %>%
  # The metric arg calls yardstick, which expects wm = 1 (the target/event)
  # as the FIRST factor level:
  dplyr::mutate(wm = factor(wm, levels = c("1", "0")))

wm_vip <-
  vi_permute(
    m1,
    train = vip_train,
    target = "wm",
    metric = "logloss",
    smaller_is_better = TRUE,  # log loss: lower is better (see list_metrics())
    nsim = 100,
    pred_wrapper = pred.fxn
  )
tictoc::toc() # 48 sec

# Save the result:
save(wm_vip, file = here::here("Modeling", "vip.RData"))
# Load the vip results:
load(here::here("Modeling", "vip.RData"))

# A named vector of the labels:
wm_vip_labels <-
  wm_metadata %>% 
  dplyr::filter(!variable %in% c("subject", "wm", "drainage")) %>% 
  tibble::deframe()

# Plot with the more descriptive variable labels: 
wm_vip %>%
  dplyr::arrange(Importance) %>% 
  dplyr::mutate(rel.imp = 100*Importance/sum(Importance)) %>% 
  # use reorder() to sort the plot so highest importance is at the top:
  ggplot(., aes(y = reorder(Variable, rel.imp), x = rel.imp)) +  
  geom_point(size = 3, color = "orange") +
  scale_y_discrete(labels = wm_vip_labels, name = NULL) +
  # theme_minimal() +
  theme_bw() +
  scale_x_continuous(name = "Relative Importance") + 
  theme(axis.title.x = element_text(face = "bold", size = 11)) +
  theme(axis.text.y = element_text(size = 8))

Permutation-based variable importance for a tuned random forest model. Importance was computed using log loss as the performance metric.

RF variable selection

Not all predictors may be important (or necessary) for good prediction. In this section, we’ll use random forest based variable selection, via the VSURF package, to possibly hone the predictor set.

# The response as a factor and vector:
X_vsurf.y <-
  X.df %>% 
  # For VSURF, wm must be a factor and the 1st level is the one you wish to predict:
  dplyr::mutate(wm = factor(wm, levels = c(1, 0))) %>% 
  dplyr::pull(wm)

# The predictor vars:
X_vsurf.x <-
  X.df %>%
  dplyr::select(-wm)
  
# Using ranger as the engine, parallel processing on 4 cores (does not take long ~5 sec):
set.seed(2331)
foo <- VSURF(x = X_vsurf.x, y = X_vsurf.y, parallel = TRUE, ncores = 4, RFimplem = "ranger", clusterType = "ranger")

# varselect.pred holds the column indices of the predictors retained at the
# prediction step. Wrap the external vector in all_of() — passing a bare
# external vector to select() is deprecated/ambiguous tidyselect usage:
selected_vars <- X_vsurf.x %>% dplyr::select(all_of(foo$varselect.pred)) %>% names()

# Create a new dataset with just these vars and the response:
X.sel <- 
  X.df %>% 
  dplyr::select(all_of(selected_vars), wm)

Tuning on the selected vars

No output to show.

Just presenting the code on how it was done.

# mlr expects a data frame:
class(X.sel)  # we are good

# Set seed for reproducibility:
set.seed(5309)  # Jenny, I got your number!

# For tuneRanger, a mlr task has to be created:
cc.task <- mlr::makeClassifTask(data = X.sel, target = "wm", positive = "1")

# Tuning process:
tictoc::tic()
rf.tuned.sel <- tuneRanger(cc.task, num.trees = 500, show.info = FALSE) # Reducing the no. trees as we don't have many predictors
tictoc::toc()  # 42 sec

# Save the fitted model so that you don't have to re-tune:
save(rf.tuned.sel, X.sel, selected_vars, file = here::here("Modeling", "rf_tuned_sel.RData"))

Examine model fit

load(here::here("Modeling", "rf_tuned_sel.RData"))  # rf.tuned.sel, X.sel, selected_vars

# The tuned hyperparameters:
# rf.tuned.sel$model

The tuned hyperparameters

rf.tuned.sel.pars <-
  list(
    mtry = rf.tuned.sel$recommended.pars$mtry,
    min.node.size = rf.tuned.sel$recommended.pars$min.node.size,
    sample.fraction = rf.tuned.sel$recommended.pars$sample.fraction,
    num.trees = rf.tuned.sel$model$learner$par.vals$num.trees
    ) 

rf.tuned.sel.pars %>%
  as_tibble() %>%
  make_kable()
mtry min.node.size sample.fraction num.trees
1 1 6 0.455 500

Estimating optimism

# 1. Prepare predictions dataframe:
sel_pred <-
  predict(rf.tuned.sel$model, newdata = X.sel)$data %>%
  dplyr::select(truth, prob.1) %>%
  dplyr::rename(response = prob.1)

# 2. Calculate apparent performance
sel_apparent_perf <- calc_metrics_cutpointr(sel_pred)

# 3. Bootstrap process
# One Harrell-style optimism bootstrap iteration for the reduced
# (VSURF-selected) predictor set: re-tune and fit on a stratified bootstrap
# resample, score on the resample (train) and on the original data (test);
# optimism = train - test for every metric.
#
# i         : iteration index (used only for progress/error reporting).
# data      : modelling data frame with factor response `wm`
#             (defaults to the global X.sel, preserving the original call
#             signature bootstrap_fxn(i)).
# num_trees : number of trees passed to tuneRanger (500 here: fewer
#             predictors than the full model).
#
# Returns a tibble (iteration, metric, train, test, optimism), or NULL if
# model fitting failed for this resample (the caller's list_rbind() drops
# NULLs).
bootstrap_fxn <- function(i, data = X.sel, num_trees = 500) {
  # Print the current iteration
  cat("Currently processing iteration: ", i, "\n")
  
  # a. Generate bootstrap sample (stratified on wm: classes are imbalanced)
  boot_sample <- 
    rsample::bootstraps(data, times = 1, strata = wm)$splits[[1]] %>%
    rsample::analysis()
  
  # b. Create classification task and fit model on the bootstrap sample.
  #    tuneRanger occasionally fails on a resample, so trap errors and drop
  #    the iteration rather than kill the whole future_map().
  cc.task <- mlr::makeClassifTask(data = boot_sample, target = "wm", positive = "1")
  
  boot_model <- tryCatch(
    expr = {
      tuneRanger::tuneRanger(cc.task, num.trees = num_trees, show.info = FALSE)$model
    },
    error = function(e) {
      cat("Error occurred in iteration", i, ":", e$message, "\n")
      NULL  # signal failure to the caller
    }
  )
  
  # If boot_model is NULL, skip this iteration
  if (is.null(boot_model)) {
    cat("Skipping iteration", i, "due to error.\n")
    return(NULL)
  }
  
  # Helper: predicted Prob(wm = 1) plus truth, in the layout that
  # calc_metrics_cutpointr() expects.
  get_pred <- function(model, newdata) {
    predict(model, newdata = newdata)$data %>%
      dplyr::select(truth, prob.1) %>%
      dplyr::rename(response = prob.1)
  }
  
  # c. Metrics on the bootstrap sample itself (training performance)
  train_perf <- calc_metrics_cutpointr(get_pred(boot_model, boot_sample))
  
  # d & e. Apply bootstrap model to the original dataset (test performance)
  test_perf <- calc_metrics_cutpointr(get_pred(boot_model, data))
  
  # f. Optimism per metric (train - test; negative for smaller-is-better
  #    metrics such as Brier, which is the correct direction downstream).
  optimism <- purrr::map2_dbl(train_perf, test_perf, \(x, y) x - y)
  
  tibble(
    iteration = i,
    metric = names(train_perf),
    train = unlist(train_perf),
    test = unlist(test_perf),
    optimism = optimism
  )
}

set.seed(123)
# Run bootstrap_fxn in parallel:
my.seeds <- c(1:1000)

plan(list(tweak(multisession, workers = 5), sequential))
tictoc::tic()
bootstrap_results <- 
  furrr::future_map(my.seeds, bootstrap_fxn, .options = furrr_options(seed = TRUE)) %>% 
  purrr::list_rbind()
tictoc::toc()  # 9755.02 sec ~ 2.7 hr

plan(sequential)


# 4. Apparent performance bootstrap HDI
apparent_CI <-
  bootstrap_results %>%
  dplyr::group_by(metric) %>%
  dplyr::summarize(app_LCI = bayestestR::hdi(train, ci = 0.95)$CI_low, 
                   app_UCI = bayestestR::hdi(train, ci = 0.95)$CI_high)


# 5. Average optimism across all bootstrap samples
avg_optimism <- 
  bootstrap_results %>%
  dplyr::group_by(metric) %>%
  dplyr::summarize(avg_optimism = mean(optimism))

# 6. Calculate optimism-corrected performance
sel_corrected_perf <- 
  tibble(
    metric = names(sel_apparent_perf),
    apparent = unlist(sel_apparent_perf)
    ) %>%
  dplyr::left_join(apparent_CI, by = "metric") %>%
  dplyr::left_join(avg_optimism, by = "metric") %>%
  dplyr::mutate(corrected = apparent - avg_optimism)


# Save the results so that you don't have to ever, ever repeat this time-consuming process:
save(sel_pred, sel_apparent_perf, bootstrap_results, sel_corrected_perf, file = here::here("Modeling", "RFseloptimism.RData"))
# Load the model-fitted probs and the optimism estimates:
# sel_pred, sel_apparent_perf, bootstrap_results, sel_corrected_perf
load(here::here("Modeling", "RFseloptimism.RData"))  

sel_corrected_perf %>% 
  make_kable()
metric apparent app_LCI app_UCI avg_optimism corrected
1 YI 0.915 0.972 1.000 0.198 0.717
2 Se 1.000 0.986 1.000 0.122 0.878
3 Sp 0.915 0.975 1.000 0.076 0.839
4 AUC 0.987 0.999 1.000 0.053 0.934
5 brier 0.065 0.002 0.014 -0.052 0.117

C-statistic

The AUROC is still extremely high, even with this reduced set of predictors.

# Get the C-statistic (AUROC) from the fitted probabilities in `sel_pred`.
# direction = "<" declares that controls (wm = 0) carry LOWER predicted
# probabilities than cases (wm = 1); without it pROC auto-selects the
# direction, which can silently overstate discrimination.
roc_obj <- pROC::roc(sel_pred$truth, sel_pred$response, levels = c(0, 1), direction = "<")

# 95% CI (DeLong, the pROC default) for the apparent C-statistic:
ci_obj <- pROC::ci.auc(roc_obj)

# One-row table: apparent estimate + 95% CI, then the optimism-corrected
# (internal) estimate. No CI is available for the corrected value, hence NA.
res_C <- matrix(
  c(ci_obj[2],   # apparent AUC (middle element of the ci.auc vector)
    ci_obj[1],   # lower .95
    ci_obj[3],   # upper .95
    sel_corrected_perf %>% dplyr::filter(metric == "AUC") %>% dplyr::pull(corrected) %>% unname(),
    NA,
    NA),
  
  nrow = 1,
  ncol = 6,
  byrow = TRUE,
  
  dimnames = list(
    c("C-statistic"),
    rep(c("Estimate", "Lower .95 ", "Upper .95"), 2)
  )
)

res_C %>% 
  make_kable() %>% 
  add_header_above(c(" " = 1, 
                     "Apparent" = 3, 
                     "Internal" = 3))
Apparent
Internal
Estimate Lower .95 Upper .95 Estimate Lower .95 Upper .95
C-statistic 0.987 0.979 0.995 0.934 NA NA

Brier score

# Brier score:
# Pull the apparent and optimism-corrected (internal) Brier scores from the
# corrected-performance table and present side by side:
tibble::tibble(`Brier (apparent)` = sel_corrected_perf %>% dplyr::filter(metric == "brier") %>% dplyr::pull(apparent) %>% unname(), 
               `Brier (internal)` = sel_corrected_perf %>% dplyr::filter(metric == "brier") %>% dplyr::pull(corrected) %>% unname()) %>% 
  make_kable()
Brier (apparent) Brier (internal)
1 0.065 0.117

Calibration

Calibration plot

Indicates:

  • poorly calibrated predicted probabilities

This model may be useful for explanation; that is, how the variables in the model influence or determine the predicted outcome. We cannot interpret such relationships causally.

Would not be advisable to use this model for predicted probability, because of the poor calibration, unless it can be re-calibrated.

For more information on calibration and re-calibration approaches, see this tidymodels post.

It may still be useful for discrimination.

# Assuming 'sel_pred' is your data frame with 'truth' and 'response' columns
# Simple binned calibration plot (predtools). truth is recoded to numeric 0/1:
calibration_plot(
  data = sel_pred %>% dplyr::mutate(truth = ifelse(truth == "1", 1, 0)),
  obs = "truth",
  pred = "response",
  title = "Calibration Plot for Random Forest Model",
  xlab = "Predicted Probability",
  ylab = "Observed Proportion"
)

# This is a nicer format I think:
# Flexible calibration curve (CalibrationCurves); keep the ggplot object so it
# can be combined with RF1CC in a later chunk:
RF2CC <- CalibrationCurves::valProbggplot(sel_pred$response, ifelse(sel_pred$truth == "1", 1, 0))$ggPlot

RF2CC

Calibration plot for a tuned random forest model fit to a selected subset of the predictors. The subset was identified by the VSURF algorithm.

Histogram of predicted probabilities

# Histogram of the fitted probabilities, faceted by observed outcome:
sel_pred %>%
  ggplot(aes(response)) +
  geom_histogram(fill = "orange", col = "orange3", bins = 20) +
  facet_wrap(~ as.factor(truth), ncol = 1, 
             labeller = as_labeller(c("0" = "No White Mold", "1" = "White Mold Present"))) +
  geom_rug(col = "blue", alpha = 0.5) + 
  labs(x = "Probability estimate of white mold", y = "No. of fields") +
  theme_bw()

Histogram of fitted probabilities for a tuned random forest model utilizing a subset of predictors identified by the VSURF algorithm.

ICI

Apparent
# As a background check:
# Get the apparent estimate using CalibrationCurves, which generates a flexible calibration curve based on loess as the default.
RFsel_calPerf <- suppressGraphics({
  CalibrationCurves::val.prob.ci.2(sel_pred$response, ifelse(sel_pred$truth == "1", 1, 0))
})

# Eavg = average absolute calibration error reported by CalibrationCurves:
RFsel_calPerf$stats["Eavg"]
# Calibration measures ICI, E50, E90 based on secondary logistic regression
# Create a copy of the X.sel data frame, with sel_pred added as a column:
Xmod <-
  X.sel %>%
  # For the loess, need wm as a numeric:
  dplyr::mutate(wm = as.numeric(wm)-1) %>% 
  dplyr::mutate(pred = sel_pred$response)

# Use loess to fit the flexible calibration curve:
fit_cal <- loess(wm ~ pred, 
                 data = Xmod, 
                 span = 0.75, 
                 degree = 2, 
                 family = "gaussian")

# Smoothed "observed" probabilities at the fitted points:
cal_obs <- predict(fit_cal)

dt_cal <- 
  cbind.data.frame(
    "obs" = cal_obs,
    "pred" = sel_pred$response)


# ICI = mean |obs - pred|, E50 = median, E90 = 90th percentile of |obs - pred|:
res_calmeas <-
  c(
    "ICI" = mean(abs(dt_cal$obs - dt_cal$pred)),
    "E50" = median(abs(dt_cal$obs - dt_cal$pred)),
    "E90" = unname(quantile(abs(dt_cal$obs - dt_cal$pred), 
                            probs = .90))
)


## Bootstrap confidence intervals for the calibration  measures (ICI, E50, E90)
alpha <- .05
# Set B = 2000 although it takes more time:
B <- 2000 
set.seed(2022)
# Using stratification on wm because of the imbalance between the classes:
vboot <- rsample::bootstraps(X.sel, times = B, strata = wm)

# Bootstrap calibration measures
numsum_boot <- function(split) {
  # Compute the calibration measures (ICI, E50, E90) for one bootstrap resample.
  # Args:
  #  split = an rsample split object (one bootstrap resample of X.sel)
  # Returns:
  #  a one-row data frame with columns ICI, E50, E90
  
  # Materialize the analysis (in-bag) set once, rather than calling
  # rsample::analysis(split) separately for the predictions and the loess:
  boot_data <- rsample::analysis(split)
  
  # Predicted Pr(wm = 1) from the tuned RF on the in-bag sample:
  boot_pred <- predict(rf.tuned.sel$model, newdata = boot_data)$data %>%
    dplyr::pull(prob.1)

  # Flexible calibration curve via loess. The predictions are placed inside
  # the data frame itself, so loess does not have to resolve boot_pred from
  # the enclosing environment (which the original relied on):
  loess_data <- boot_data %>%
    dplyr::mutate(wm = as.numeric(wm) - 1,  # loess needs a numeric response
                  boot_pred = boot_pred)

  v_cal <- loess(wm ~ boot_pred, 
                 data = loess_data, 
                 span = 0.75, 
                 degree = 2, 
                 family = "gaussian")

  # Absolute difference between the smoothed "observed" and predicted probs:
  absdiff_boot <- abs(predict(v_cal) - boot_pred)

  # Return the measures explicitly (the original ended on an assignment,
  # which returns its value invisibly). unname() drops the "50%"/"90%"
  # names that quantile() would otherwise attach as row names:
  data.frame(
    "ICI" = mean(absdiff_boot),
    "E50" = unname(quantile(absdiff_boot, probs = .5)),
    "E90" = unname(quantile(absdiff_boot, probs = .9))
  )
}  # end of function

# Example of use:
# (bar <- purrr::pluck(vboot, "splits", 1) %>% numsum_boot())


# Apply numsum_boot to every bootstrap split, then unpack each one-row result
# into its own ICI/E50/E90 columns:
tictoc::tic()
numsum_b <- 
  vboot %>% 
  dplyr::mutate(num_cal_boot = purrr::map(splits, numsum_boot),
                
                ICI = purrr::map_dbl(num_cal_boot, ~ .x$ICI),
         
                E50 = purrr::map_dbl(num_cal_boot, ~ .x$E50),
         
                E90 = purrr::map_dbl(num_cal_boot, ~ .x$E90)
         )
tictoc::toc()  # ~ 74 sec
23.74 sec elapsed
# numsum_b is a tibble with 2,000 rows
# Now we need to process the results to get the bootstrap CIs:
alpha <- .05
# One row: for each measure, the apparent estimate followed by the
# percentile-bootstrap 95% CI (2.5th and 97.5th percentiles of the
# bootstrap distribution):
res_numcal <- matrix(c(res_calmeas["ICI"],
                       quantile(numsum_b$ICI, probs = alpha / 2),
                       quantile(numsum_b$ICI, 
                                probs = 1 - alpha / 2),
                       
                       res_calmeas["E50"],
                       quantile(numsum_b$E50, probs = alpha / 2),
                       quantile(numsum_b$E50, probs = 1 - alpha / 2),
                       
                       res_calmeas["E90"],
                       quantile(numsum_b$E90, probs = alpha / 2),
                       quantile(numsum_b$E90, probs = 1 - alpha / 2)
                       ),
                     nrow = 1,
                     ncol = 9,
                     byrow = T,
                     dimnames = list(
                       c("Apparent"),
                       rep(
                         c("Estimate", "Lower.95", "Upper.95"),
                         3))
)


# Present the results:
res_numcal %>% 
  make_kable()  %>% 
  add_header_above(c(" " = 1,  
                     "ICI" = 3, 
                     "E50" = 3,
                     "E90" = 3))
ICI
E50
E90
Estimate Lower.95 Upper.95 Estimate Lower.95 Upper.95 Estimate Lower.95 Upper.95
Apparent 0.124 0.114 0.135 0.101 0.093 0.115 0.301 0.263 0.346

Comparison to the RF1 model

# Stack the two calibration curves (RF1 model above, RF2/selected-subset model
# below) with a shared legend; RF1CC is created in an earlier chunk:
(RF1CC + RF2CC) +
  plot_annotation(tag_levels = "A") +
  plot_layout(nrow = 2, byrow = TRUE, guides = "collect") & 
  cowplot::theme_half_open(font_size = 12) &
  theme(legend.position = "bottom", 
        legend.direction = "horizontal")

# ggsave() with no plot argument saves the last plot displayed:
ggsave("figs/RF_calibration_plot.png", dpi = 600, height = 7, width = 5, bg = "white" )
ggsave("figs/RF_calibration_plot.pdf", dpi = 600, height = 7, width = 5, bg = "white" )

Re-Calibration

Here we just use simple Platt scaling manually. NOTE: Platt scaling assumes a sigmoid relationship between the original probabilities and the true probabilities. If this assumption doesn’t hold for your data, you might want to consider other calibration methods like isotonic regression (which we do not pursue here).

You can see a few things: (i) one’s assessment of calibration can be biased by the graphical output of the program used to generate the calibration plot. The simpler plot is generated by the calibration_plot function in the predtools package. It uses a simple form of binning. The second plot is generated by the valProbggplot function in the CalibrationCurves package, which is based on functions from the rms package, and provides some additional statistics with the plot.

While the slope has improved, we can see that the re-calibrated probs are closer to the extremes of 0 and 1.

In summary, this short exercise shows that re-calibration does require some work – Platt scaling works partially here. The small sample size here is also an issue that may be having an effect.

Re-calibration plot

predtools package
# Fit a logistic regression model to the response using the predicted probs from the RF model: 
# (Platt scaling: the RF probabilities become the single predictor)
platt_model <- glm(truth ~ response, data = sel_pred, family = "binomial")

# Apply calibration to the data:
calibrated_probs <- predict(platt_model, newdata = data.frame(response = sel_pred$response), type = "response")

calibrated_data <- data.frame(truth = sel_pred$truth, response = calibrated_probs)

# Binned calibration plot of the re-calibrated probabilities (predtools):
calibration_plot(
  data = calibrated_data %>% dplyr::mutate(truth = ifelse(truth == "1", 1, 0)),
  obs = "truth",
  pred = "response",
  title = "Re-Calibration Plot for Random Forest Model",
  xlab = "Predicted Probability",
  ylab = "Observed Proportion"
)

Calibration plot for the recalibrated random forest model, plotted using the predtools package
CalibrationCurves package
# This is a nicer format I think:
CalibrationCurves::valProbggplot(calibrated_data$response, ifelse(calibrated_data$truth == "1", 1, 0))$ggPlot

Calibration plot for the recalibrated random forest model, plotted using the CalibrationCurves package

Histogram of calibrated probabilities

# Histogram of the Platt-scaled probabilities, faceted by observed outcome:
calibrated_data %>%
  ggplot(aes(response)) +
  geom_histogram(fill = "orange", col = "orange3", bins = 20) +
  facet_wrap(~ as.factor(truth), ncol = 1, 
             labeller = as_labeller(c("0" = "No White Mold", "1" = "White Mold Present"))) +
  geom_rug(col = "blue", alpha = 0.5) + 
  labs(x = "Calibrated probability estimate of white mold") +
  theme_bw()

Model-agnostic interpretation

# Reminder: the tuned params:
rf.tuned.sel.pars


# Refit the RF with ranger directly (rather than via mlr/tuneRanger) so that
# permutation importance and probability predictions are available to iml:
set.seed(14092)
m2 <- ranger(wm ~ ., 
             data = X.sel, 
             importance = 'permutation',
             scale.permutation.importance = TRUE,
             # Set the hyperparameters:
             num.trees = rf.tuned.sel.pars$num.trees,
             mtry = rf.tuned.sel.pars$mtry,
             min.node.size = rf.tuned.sel.pars$min.node.size, 
             sample.fraction = rf.tuned.sel.pars$sample.fraction,
             probability = TRUE)
# 1) create a data frame with just the features
features <- X.sel %>% dplyr::select(-wm)

# 2) Create a vector with the actual responses
# (numeric 0/1, as expected by the iml Predictor)
response <-  X.sel %>% dplyr::mutate(wm = ifelse(wm == "1", 1, 0)) %>% dplyr::pull(wm)

# 3) Create custom predict function that returns the predicted values as a vector
pred.fxn <- function(object, newdata) {
  # Prediction wrapper used by iml/vip: return Pr(wm = 1) as a plain vector.
  # Args:
  #  object = a fitted ranger probability model
  #  newdata = data frame of features to predict on
  # Returns:
  #  numeric vector of predicted probabilities (2nd class column from ranger)
  prob_matrix <- predict(object, data = newdata, type = "response")$predictions
  prob_matrix[, 2]
}

# Example:
# pred.fxn(m2, X.sel) %>% head()

# Once we have these three components we can create our model agnostic objects for the iml package, which will just pass these downstream components (along with the ML model) to other functions.
# iml model agnostic object:
components_iml <- Predictor$new(
  model = m2, 
  data = features, 
  y = response, 
  predict.fun = pred.fxn
)

Variable importance

# Here we will use the vi_permute function to set some other options, like use logloss as the metric.
wm_vip <- 
  vi_permute(
    m2,
  # metric arg calls yardstick, which expects wm = 1 (target) as the FIRST level; and as a factor:
  train = X.sel %>% dplyr::mutate(wm = factor(wm, levels = c("1", "0"))),
  target = "wm",
  metric = "logloss",
  smaller_is_better = TRUE,  # see list_metrics()
  nsim = 10,
  pred_wrapper = pred.fxn
)

# A named vector of the labels:
# (variable name -> descriptive label, taken from the metadata table)
wm_vip_labels <-
  wm_metadata %>% 
  dplyr::filter(variable %in% selected_vars) %>% 
  tibble::deframe()

# Plot with the more descriptive variable labels: 
# Importance is rescaled to percent of the total before plotting.
wm_vip %>%
  dplyr::arrange(Importance) %>% 
  dplyr::mutate(rel.imp = 100*Importance/sum(Importance)) %>% 
  # use reorder() to sort the plot so highest importance is at the top:
  ggplot(., aes(y = reorder(Variable, rel.imp), x = rel.imp)) +  
  geom_point(size = 3, color = "orange") +
  scale_y_discrete(labels = wm_vip_labels, name = NULL) +
  theme_bw() +
  scale_x_continuous(name = "Relative Importance") + 
  theme(axis.title.x = element_text(face = "bold", size = 11))

Permutation-based variable importance for a tuned random forest model fit to a VSURF-identified subset of variables. Importance was computed using log loss as the performance metric.

Partial dependence

I think the main point these plots demonstrate is the non-linearity in the relationship between the response and the predictors. On the probability scale (y-axis), we would expect a S-shaped curve. If the logit were plotted on the y-axis instead, the relationships should appear linear.

Probability scale

# Reminder: the set of vars
# selected_vars

# If grid.resolution left NULL, it will default to the minimum between 51 and the number of unique data points for each of the continuous independent variables listed in pred.var. Therefore, let's get an idea of the number of unique data points for each var. I will use this info to set grid.resolution to a value that will be above the minimum of 51 grid points, but not too high to slow down computation.

# length(unique(X.sel$t2m_mean_to_4dap))     # 297
# length(unique(X.sel$sm_40dap_to_49dap))    # 297
# length(unique(X.sel$stsm_35dap_to_44dap))  # 297
# length(unique(X.sel$sm_5dap_to_15dap))     # 297
# length(unique(X.sel$rain36to50dap))        # 213
# length(unique(X.sel$rainto35dap))          # 210
# length(unique(X.sel$log_silt_clay))        # 305

# Compute partial dependence values. Going to write a wrapper function to get the partial values.
# NOTE: there is a bug in the pdp::partial function. The prob arg always returns probabilities, despite the default being FALSE. 

get.pd.df <- function(var, getprob) {
  # Compute partial dependence values for a single predictor of model m2.
  # Args:
  #  var = quoted character string of the predictor's name
  #  getprob = logical passed to the `prob` argument of pdp::partial
  # Returns:
  #  a data frame with columns `predvar` and `yhat`
  
  pd_vals <- partial(m2,
                     train = X.sel,
                     pred.var = var,
                     plot = FALSE,
                     chull = TRUE,
                     progress = FALSE,
                     type = "classification",  # the type of model we have
                     which.class = 2,          # Pr(wm=1) is in the 2nd column of probs returned by ranger
                     grid.resolution = 100,
                     prob = getprob)
  
  # The first column is named after `var`; standardize it to `predvar`, so a
  # single plotting wrapper can consume the result for any predictor:
  dplyr::rename_with(pd_vals, ~ "predvar", .cols = all_of(var))
}
  

my.pdp.plot <- function(var) {
  # Partial dependence plot on the probability scale.
  # Args:
  #  var = quoted character string of the predictor's name
  # Returns:
  #  a ggplot representation of the pdp
  
  pd_df <- get.pd.df(var, getprob = TRUE)
  
  ggplot(pd_df, aes(x = predvar, y = yhat)) +
    geom_line(color = "orange", linewidth = 1) +
    geom_rug(color = "grey50", alpha = 0.5, sides = "l") +
    labs(x = wm_vip_labels[var],
         y = "Marginal Probability") +
    theme_minimal() +
    theme(panel.grid.major = element_line(color = "gray90"),
          panel.grid.minor = element_line(color = "gray95"),
          # Smaller x-axis title to reduce crowding in the combined figure:
          axis.title.x = element_text(face = "bold", size = 6),
          axis.title.y = element_text(face = "bold", size = 8))
}

# Do the pdp's for the 7 vars: 
p1 <- my.pdp.plot("t2m_mean_to_4dap")
p2 <- my.pdp.plot("sm_40dap_to_49dap")
p3 <- my.pdp.plot("stsm_35dap_to_44dap")
p4 <- my.pdp.plot("sm_5dap_to_15dap")
p5 <- my.pdp.plot("rain36to50dap")
p6 <- my.pdp.plot("rainto35dap")
p7 <- my.pdp.plot("log_silt_clay")

# Combine the plots, collect the common y-axis labels:
p1 + p2 + p3 + p4 + p5 + p6 + p7 + plot_layout(ncol = 3, axis_titles = "collect")

Partial dependence plots for the VSURF-selected variables used in fitting a random forest model for predicting white mold in snap bean. The y-axis is on the probability scale.

Logit scale

When prob = FALSE, the partial dependence values are returned on a centered logit scale — a scale similar, but not identical, to the logit.

my.pdp.plot.logit <- function(var) {
  # Partial dependence plot on the (centered) logit scale.
  # Args:
  #  var = quoted character string of the predictor's name
  # Returns:
  #  a ggplot representation of the pdp
  
  pd_df <- get.pd.df(var, getprob = FALSE)
  
  ggplot(pd_df, aes(x = predvar, y = yhat)) +
    geom_line(color = "orange", linewidth = 1) +
    geom_rug(color = "grey50", alpha = 0.5, sides = "l") +
    labs(x = wm_vip_labels[var],
         y = "Centered logit") +
    theme_minimal() +
    theme(panel.grid.major = element_line(color = "gray90"),
          panel.grid.minor = element_line(color = "gray95"),
          # Smaller x-axis title to reduce crowding in the combined figure:
          axis.title.x = element_text(face = "bold", size = 6),
          axis.title.y = element_text(face = "bold", size = 8))
}



# Logit-scale pdp's for the same 7 vars:
p1 <- my.pdp.plot.logit("t2m_mean_to_4dap")
p2 <- my.pdp.plot.logit("sm_40dap_to_49dap")
p3 <- my.pdp.plot.logit("stsm_35dap_to_44dap")
p4 <- my.pdp.plot.logit("sm_5dap_to_15dap")
p5 <- my.pdp.plot.logit("rain36to50dap")
p6 <- my.pdp.plot.logit("rainto35dap")
p7 <- my.pdp.plot.logit("log_silt_clay")

# Combine the plots, collect the common y-axis labels:
p1 + p2 + p3 + p4 + p5 + p6 + p7 + plot_layout(ncol = 3, axis_titles = "collect")

Partial dependence plots for the VSURF-selected variables used in fitting a random forest model for predicting white mold in snap bean. The y-axis is on the logit scale.

Feature interactions

Estimate the interaction between any two features, which tells us how strongly two specific features interact with each other in the model.

H-statistics

The variable with the strongest interaction signal is the soil temperature to volumetric soil water content from 35 to 44 dap (stsm_35dap_to_44dap).

# Set the seed for reproducibility and increase the grid size from the default. The interactions are not strong, and without the seed set, there is variability in the results.  The increased grid size also helps mitigate against the variability. 
set.seed(456)
tictoc::tic()
# Overall (one vs all) H-statistics for every feature:
interact <- iml::Interaction$new(components_iml, grid.size = 100)
Warning: pacote 'TH.data' foi compilado no R versão 4.4.2
Warning: pacote 'DiceKriging' foi compilado no R versão 4.4.2
Warning: pacote 'sandwich' foi compilado no R versão 4.4.2
Warning: pacote 'zoo' foi compilado no R versão 4.4.2
Warning: pacote 'gt' foi compilado no R versão 4.4.2
Warning: pacote 'rprojroot' foi compilado no R versão 4.4.2
Warning: pacote 'randomForest' foi compilado no R versão 4.4.2
Warning: pacote 'here' foi compilado no R versão 4.4.2
Warning: pacote 'withr' foi compilado no R versão 4.4.2
Warning: pacote 'doParallel' foi compilado no R versão 4.4.2
Warning: pacote 'htmlTable' foi compilado no R versão 4.4.2
Warning: pacote 'R.utils' foi compilado no R versão 4.4.2
Warning: pacote 'parallelMap' foi compilado no R versão 4.4.2
Warning: pacote 'Metrics' foi compilado no R versão 4.4.2
Warning: pacote 'markdown' foi compilado no R versão 4.4.2
Warning: pacote 'bookdown' foi compilado no R versão 4.4.2
Warning: pacote 'svglite' foi compilado no R versão 4.4.2
Warning: pacote 'mvtnorm' foi compilado no R versão 4.4.2
Warning: pacote 'fastmatch' foi compilado no R versão 4.4.2
Warning: pacote 'multcomp' foi compilado no R versão 4.4.2
Warning: pacote 'cards' foi compilado no R versão 4.4.2
tictoc::toc()  # ~ 11 sec

# Only 7 rows of course, as there are only 7 vars:
iml.inter.results <- interact$results

# Reminder: Vector of the vars:
selected_vars

# A named vector of the labels:
# (same variable -> label mapping used for the vip plot)
inter_labels <-
  wm_metadata %>% 
  dplyr::filter(variable %in% selected_vars) %>% 
  tibble::deframe()

# Plot with the more descriptive variable labels: 
iml.inter.results %>%
  dplyr::arrange(desc(.interaction)) %>% 
  # use reorder() to sort the plot so highest importance is at the top:
  ggplot(., aes(y = reorder(.feature, .interaction), x = .interaction)) +  
  geom_point(size = 3, color = "orange") +
  scale_y_discrete(labels = inter_labels, name = NULL) +
  theme_minimal() +
  scale_x_continuous(name = "Overall interaction strength", limits = c(0, 0.35)) + 
  theme(axis.title.x = element_text(face = "bold", size = 11))

H-statistic for interactions among the variables in a random forest model for forecasting white mold in snap bean.
# Highest:
# stsm_35dap_to_44dap

stsm_35dap_to_44dap

Now we look at the interaction between stsm_35dap_to_44dap and all other features, which tells us how strongly (in total) stsm_35dap_to_44dap interacts in the model with all the other features.

We see that stsm_35dap_to_44dap interacts mostly with sm_5dap_to_15dap and rainto35dap.

# H-statistics for 2-way interactions between stsm_35dap_to_44dap and each
# other feature:
interact_2way <- iml::Interaction$new(components_iml, feature = "stsm_35dap_to_44dap")

interact_2way$results %>% 
  dplyr::arrange(desc(.interaction)) %>% 
  make_kable()
.feature .interaction
1 sm_5dap_to_15dap:stsm_35dap_to_44dap 0.329
2 rainto35dap:stsm_35dap_to_44dap 0.288
3 rain36to50dap:stsm_35dap_to_44dap 0.136
4 log_silt_clay:stsm_35dap_to_44dap 0.115
5 sm_40dap_to_49dap:stsm_35dap_to_44dap 0.114
6 t2m_mean_to_4dap:stsm_35dap_to_44dap 0.112
sm_5dap_to_15dap and stsm_35dap_to_44dap
# Two-way PDP using iml:
interaction_pdp <- iml::FeatureEffect$new(
  components_iml, 
  c("sm_5dap_to_15dap", "stsm_35dap_to_44dap"), 
  method = "pdp",
  grid.size = 30
) 

# Default output is a heatmap:
# plot(interaction_pdp)

# A wireframe plot is more informative:
# See: https://www.stat.math.ethz.ch/pipermail/r-help/2009-August/400463.html
# for how to make adjustments to the tick, axis and legend labels.
# .value holds the partial dependence (Pr(wm)) on the 30x30 grid:
wireframe(.value ~ sm_5dap_to_15dap*stsm_35dap_to_44dap, 
          data = interaction_pdp$results,
          # Adjust the size of the axis labels:
          xlab = list(cex = .7),
          ylab = list(cex = .7),
          zlab = list(label = "Pr(wm)", cex = 0.7),
          main = NULL,
          drape = TRUE,
          colorkey = TRUE,
          screen = list(z = -60, x = -60),
          # Change tick label size on the x, y, z axes:
          scales = list(arrows = FALSE, x = list(cex = .5), 
                        y = list(cex = .5), z = list(cex = 0.5)),
          # Change the tick label size on the legend:
          par.settings = list(axis.text = list(cex = 0.5))
          )

SHAP – (SHapley Additive exPlanations)

# Select background data. 
# Approach 1
#  Because of the imbalance in the data, create a subsample which consists of all wm(1) obs, plus 127 wm(0) obs, to give a sample of 200 rows:
# set.seed(461)
# bg_X <- bind_rows(X.sel %>% dplyr::filter(wm == 1),
#           X.sel %>% dplyr::filter(wm == 0) %>% dplyr::slice_sample(n = 127))

# Crunch SHAP values for all 200 rows:
# Note: Since the number of features is small, we use permshap()
# tic()
# ps <- kernelshap::permshap(m2, X.sel[-8], bg_X = bg_X)
# toc()  # ~ 1 min

 
# Approach 2 (which we use here)
# Using all the data:
tictoc::tic()
# ps <- kernelshap::permshap(m2, X.sel[-8], bg_X = X.sel)  # wm is in the 8th column
# Setting it up this way, to keep the wm variable for labelling dependence plots later
ps <- kernelshap::permshap(m2, X.sel, bg_X = X.sel, feature_names = names(X.sel))
tictoc::toc() # ~ 214 sec

# Create the viz object:
sv <- shapviz(ps)

# Save the object to avoid having to repeat recreating the sv object:
save(sv, file = here::here("Modeling", "shapvis.RData"))
load(here::here("Modeling", "shapvis.RData"))  # sv

SHAP Importance Plots

A beeswarm plot (sometimes called “SHAP summary plot”).

The features are sorted in decreasing order of importance.

# wm_vip_labels is defined in chunk `sel-RF-model-vip`
# With all 356 data points, the beeswarm plot can look too cluttered, can't tell much. So set a transparency via alpha = 0.5 and smaller points.

# We need some further manipulations so that the plot does not include a row for wm. 
# sv is a list of two: 0 and 1. We want the values under 1 corresponding to wm=1
# The list under 1 consists of two parts: S (matrix format), X (dataframe)
# We need to remove the wm parts from S and X.
# Make and work with a copy of sv:
sv.mod <- sv

sv.mod[["1"]]$S <- sv.mod[["1"]]$S[, -8]  # because S is a matrix
sv.mod[["1"]]$X <- sv.mod[["1"]]$X[-8]    # because X is a dataframe

# The scale_color_gradient call replaces the color scale that sv_importance
# already sets, hence the "Scale for colour is already present" message below:
sv_importance(sv.mod[["1"]], kind = "bee", alpha = 1, size = 1) +
  scale_y_discrete(labels = wm_vip_labels, name = NULL) +
  scale_color_gradient(low ="#1e88e5ff", high = "#ff0d57ff",limits =c(0,1),
                       breaks = c(0, 1), labels = c(" Low", "High "),
                       guide = guide_colorbar(barwidth = 0.3,barheight =15 ))+
  cowplot::theme_half_open(font_size = 12)
Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.

Permutation-based SHAP summary plot depicting the contributions of variables in a random forest model to the prediction of white mold in snap bean.
ggsave("figs/shap_beeswarm.png", dpi = 600, height = 5, width = 7, bg = "white")
ggsave("figs/shap_beeswarm.pdf", dpi = 600, height = 5, width = 7, bg = "white")

SHAP Dependence Plot

We focus here on pure main effects

# Reminder of the features:
# names(X.sel)

# Examples:
# sv_dependence(sv[["1"]], v = "sm_40dap_to_49dap", color_var = "stsm_35dap_to_44dap")
# sv_dependence(sv[["1"]], v = "sm_40dap_to_49dap", color_var = "rainto35dap")

# A wrapper function to do a main dependence plot (no interactions shown)
# NOTE: this first version (uncolored points) is redefined by a second
# my_dependence_fxn later in the file, which colors the points by wm status.
my_dependence_fxn <- function(i) {
  # SHAP dependence plot for one predictor (main effect only; color_var = NULL
  # suppresses the interaction coloring).
  # Args:
  #  i = numeric index 1-7 to pull out a var from the respective vector
  # Returns:
  #  an edited dependence plot
  
  sv_dependence(sv[["1"]], selected_vars[i], color_var = NULL, color = "orange", size=1) +
  geom_smooth(method = "loess", formula = 'y ~ x', se = FALSE) +
  xlab(wm_vip_labels[selected_vars[i]]) + 
  theme_bw() +
  theme(axis.title.y = element_text(face = "bold", size = 8)) +
  theme(
        axis.text.x = element_text(size = 7),  # Decrease x-axis tick label size
        axis.text.y = element_text(size = 7)  # Decrease y-axis tick label size
        ) +
  theme(axis.title.x = element_text(face = "bold", size = 8)) 
}
# The main goal is to color the points in the plot by wm status (i.e., use separate colors for the wm=0 and wm=1 groups)

# The best way to do this was to extract the relevant parts out of sv and create a tibble that can be passed directly into ggplot.

my_dependence_fxn <- function(i) {
  # SHAP dependence plot for one predictor, with points colored by wm status.
  # (Redefines the wrapper from the previous chunk.)
  # Args:
  #  i = numeric index 1-7 to pull out a var from the respective vector
  # Returns:
  #  an edited dependence plot
  
  # Define a colorblind-friendly palette
  cbPalette <- c("#999999", "#E69F00")  # Two colors for binary factor levels
  
  # x = feature values, y = SHAP values, wm = outcome (column 8 of X):
  foo <- tibble(x = sv[["1"]]$X[, i], y = sv[["1"]]$S[, i], wm = sv[["1"]]$X[, 8])
  
  ggplot(data = foo, aes(x = x, y = y, color = wm)) +
    geom_point(size = 1) +  
    # Single smooth line across both wm groups:
    geom_smooth(method = "loess", formula = 'y ~ x', se = FALSE, 
                color = "blue1", aes(group = 1)) +  
    # Apply colorblind-friendly palette
    scale_color_manual(values = cbPalette) +  
    xlab(wm_vip_labels[selected_vars[i]]) + 
    ylab("SHAP value") +
    # theme_bw() was called twice in the original; once suffices:
    theme_bw() +
    # Theme tweaks consolidated into a single theme() call:
    theme(axis.title.x = element_text(face = "bold", size = 8),
          axis.title.y = element_text(face = "bold", size = 8),
          axis.text.x = element_text(size = 7),  # Decrease x-axis tick label size
          axis.text.y = element_text(size = 7))  # Decrease y-axis tick label size
}
# Dependence plots for the 7 predictors (indices into selected_vars):
p1 <- my_dependence_fxn(1)
p2 <- my_dependence_fxn(2)
p3 <- my_dependence_fxn(3)
p4 <- my_dependence_fxn(4)
p5 <- my_dependence_fxn(5)
p6 <- my_dependence_fxn(6)
p7 <- my_dependence_fxn(7)

# This places the legend guide in the unoccupied panel
(p1 + p2 + p3 + p4 + p5 + p6 + p7 + guide_area()) +
  plot_annotation(tag_levels = "A") +
  plot_layout(ncol = 2, byrow = TRUE, guides = "collect", axis_titles = "collect") & 
  theme(legend.position = "bottom", 
        legend.direction = "horizontal")

Main effects SHAP dependence plots for the seven predictors in a random forest model for white mold in snap bean.

Prediction

We now turn to logistic regression to build a model.

LR model I

  • Predictors represented in the model as-is
  • No representation as restricted cubic splines (to account for nonlinearities in the logit)
  • No penalization
load(here::here("Modeling", "rf_tuned_sel.RData"))  # rf.tuned.sel, X.sel, selected_vars 
# rm(list = ls()[!(ls() %in% c("X.sel", "make_kable"))])
# Makes life easier to have wm as a numeric (0,1) instead of as a factor:
X.sel <- 
  X.sel %>% 
  dplyr::mutate(wm = as.numeric(wm)-1)

# Create a datadist object
# (rms needs this to compute predictor summaries/adjust-to values)
dd <- rms::datadist(X.sel)

options(datadist = "dd")

# Build the model formula (main effects only, no splines) as a single string:
fmla <- formula(paste("wm", "~", 
                      paste(c("sm_40dap_to_49dap",
                              "t2m_mean_to_4dap",
                              "stsm_35dap_to_44dap",
                              "sm_5dap_to_15dap", 
                              "log_silt_clay", 
                              "rainto35dap", 
                              "rain36to50dap"),
                            collapse = "+")))



# NOTE: getting this warning message, and have not been able to solve it. 
# From what I read, this is a software bug. Therefore, I am wrapping the code in suppressWarnings...
# Warning message:
# In formula.character(object, env = baseenv()) :
#   Using formula(x) is deprecated when x is a character vector of length > 1.
#   Consider formula(paste(x, collapse = " ")) instead.

# Set x = T and y = T to be able to compute bootstraps later for assessing optimism:
M0 <- rms::lrm(fmla, data = X.sel, x = T, y = T)

suppressWarnings(pred <- predict(M0, type = "fitted.ind"))

# No. obs:
n <- as.vector(M0$stats["Obs"])

# LR = Likelihood Ratio
LR <- as.vector(M0$stats["Model L.R."])

# Max CS_app:
Max_CS_app <- as.vector(1 - exp(-M0$deviance[1]/n))

# Model summary:
print(M0, coef = T)

# The no. of predictor parameters:
as.vector(M0$stats["d.f."])

## IF you have the original development data, you can adjust for optimism via bootstrapping:
set.seed(2022)
M0.boot <- rms::validate(M0, B = 1000)


# Set up for model performance statistics via the CalibrationCurves package:
calPerf <- suppressGraphics({
  CalibrationCurves::val.prob.ci.2(pred, X.sel$wm)
})


# Get set up for using the `pminternal` package.  See the file Troubleshooting.R for more details. Basically, to avoid error messages and conflicts, we (i) do not load `pminternal`, (ii) write custom model fitting and predict functions.
model_fun <- function(data, ...) {
  # Fit the 7-predictor logistic regression used for internal validation.
  # Written as a custom fit function so pminternal can be used without
  # being loaded (see Troubleshooting.R).
  # Args:
  #  data = data frame holding wm (0/1) and the seven selected predictors
  #  ... = ignored; accepted for compatibility with pminternal's interface
  # Returns:
  #  a fitted glm object
  wm_formula <- wm ~ sm_40dap_to_49dap + t2m_mean_to_4dap +
    stsm_35dap_to_44dap + sm_5dap_to_15dap + log_silt_clay +
    rainto35dap + rain36to50dap
  glm(wm_formula, data = data, family = "binomial")
}


pred_fun <- function(model, data, ...) {
  # Predicted probabilities (response scale) from a fitted glm on `data`.
  # Extra args are ignored; accepted for pminternal compatibility.
  predict(object = model, newdata = data, type = "response")
}

# for calibration plot:
# 200 evenly spaced points spanning the range of the fitted probabilities:
eval <- seq(min(pred), max(pred), length.out = 200)

set.seed(2022)
# Takes ~ 32 sec
# Bootstrap-optimism internal validation via pminternal (called with :: to
# avoid loading the package; see the comment above model_fun):
mod_iv <- pminternal::validate(method = "boot_optimism", 
                               data = X.sel, 
                               outcome = "wm", 
                               model_fun = model_fun, 
                               pred_fun = pred_fun, 
                               B = 2000, 
                               cores = 4,
                               # Here we use loess:
                               calib_args = list(smooth = "loess", eval = eval))

Model summary

# Model summary:
# Print the rms::lrm fit, including the coefficient table:
print(M0, coef = T)
Logistic Regression Model

rms::lrm(formula = fmla, data = X.sel, x = T, y = T)

                       Model Likelihood      Discrimination    Rank Discrim.    
                             Ratio Test             Indexes          Indexes    
Obs           356    LR chi2      42.67      R2       0.177    C       0.728    
 0            283    d.f.             7      R2(7,356)0.095    Dxy     0.456    
 1             73    Pr(> chi2) <0.0001    R2(7,174.1)0.185    gamma   0.456    
max |deriv| 1e-08                            Brier    0.140    tau-a   0.149    

                    Coef     S.E.   Wald Z Pr(>|Z|)
Intercept            -7.5358 3.1717 -2.38  0.0175  
sm_40dap_to_49dap    13.5259 7.7544  1.74  0.0811  
t2m_mean_to_4dap      0.3344 0.0726  4.61  <0.0001 
stsm_35dap_to_44dap  -0.0150 0.0155 -0.97  0.3322  
sm_5dap_to_15dap    -16.5571 7.5436 -2.19  0.0282  
log_silt_clay         0.9923 0.8333  1.19  0.2337  
rainto35dap           0.0038 0.0041  0.93  0.3521  
rain36to50dap        -0.0037 0.0071 -0.52  0.6020  

Fitted model formula

Output a function for obtaining predictions on the logit scale from the fitted LR model.

Function(M0)
function (sm_40dap_to_49dap = 0.30222176, t2m_mean_to_4dap = 20.353918, 
    stsm_35dap_to_44dap = 65.915896, sm_5dap_to_15dap = 0.30405061, 
    log_silt_clay = 1.102371, rainto35dap = 114.4, rain36to50dap = 46.65) 
{
    -7.5357792 + 13.525937 * sm_40dap_to_49dap + 0.33437285 * 
        t2m_mean_to_4dap - 0.014985675 * stsm_35dap_to_44dap - 
        16.557108 * sm_5dap_to_15dap + 0.99230044 * log_silt_clay + 
        0.0037716469 * rainto35dap - 0.0037190062 * rain36to50dap
}
<environment: 0x0000026ed7761b68>

Model fit statistics

We’ll focus on statistics that Riley and colleagues have suggested should be reported for logistic regression models.

No. predictor variables

# Number of distinct predictor variables in the fitted model:
length(M0$Design$name) %>% 
  make_kable(col.names = "No. predictors")
No. predictors
1 7

No. predictor parameters

# Number of estimated predictor parameters (the model d.f. from rms::lrm):
as.vector(M0$stats["d.f."]) %>% 
  make_kable(col.names = "No. predictor params")
No. predictor params
1 7

Events per predictor param

# Events per predictor parameter: no. of events (wm = 1) divided by model d.f.:
(sum(M0$y)/as.vector(M0$stats["d.f."])) %>% 
  make_kable(col.names = "EPP")
EPP
1 10.4

Max Cox-Snell and LR

# Apparent maximum attainable Cox-Snell R2 and the model likelihood ratio statistic:
tibble::tibble(`Max Cox-Snell (app)` = Max_CS_app, `Likelihood ratio` = LR) %>% 
  make_kable()
Max Cox-Snell (app) Likelihood ratio
1 0.637 42.7

Nagelkerke

# Apparent Nagelkerke R2 from the fitted rms::lrm model:
Nagelkerke <- as.vector(M0$stats["R2"])

# Bootstrap CI for Nagelkerke R2:
g <- rms::bootcov(M0, stat = 'R2', seed = 4, B = 1000)
# The CI limits (percentile bootstrap):
g.quan <- quantile(g$boot.stats, c(.025, .975))
# Lower CI bound:
Nagelkerke.LL <- g.quan[1]
# Upper CI bound:
Nagelkerke.UL <- g.quan[2]

# Internal validation: optimism-adjusted Nagelkerke's R2 from the bootstrap
# validation results. Use row/column names rather than the positional index
# [2, 5], so the code is robust to changes in the layout of the
# rms::validate() matrix (and consistent with the Brier section's indexing):
Nagelkerke_adj <- M0.boot["R2", "index.corrected"]

res_Nagelkerke <- matrix(
  c(Nagelkerke,
    Nagelkerke.LL,
    Nagelkerke.UL,
    Nagelkerke_adj,
    NA,   # no bootstrap CI for the optimism-corrected estimate
    NA),
  
  nrow = 1,
  ncol = 6,
  byrow = TRUE,  # TRUE, not T (T is reassignable)
  
  dimnames = list(
    c("Nagelkerke R2"),
    # NOTE(review): "Lower .95 " carries a trailing space in the original
    # headers; kept byte-identical to preserve the rendered table.
    rep(c("Estimate", "Lower .95 ", "Upper .95"), 2)
  )
)

res_Nagelkerke %>% 
  make_kable() %>% 
  add_header_above(c(" " = 1, 
                     "Apparent" = 3, 
                     "Internal" = 3))
Apparent
Internal
Estimate Lower .95 Upper .95 Estimate Lower .95 Upper .95
Nagelkerke R2 0.177 0.116 0.321 0.121 NA NA

Cox-Snell

# CS_app = Apparent estimate of the Cox-Snell statistic:
# Point estimate and 95% CI bounds, rescaled from Nagelkerke's R2 by the
# maximum attainable Cox-Snell value:
CS_app <- c(Nagelkerke, Nagelkerke.LL, Nagelkerke.UL)*Max_CS_app

# Adjusted for optimism:
CS_adj <- Nagelkerke_adj*Max_CS_app

res_CS <- matrix(
  c(CS_app,
    CS_adj,
    NA,   # no bootstrap CI for the optimism-corrected estimate
    NA),
  
  nrow = 1,
  ncol = 6,
  byrow = TRUE,  # TRUE, not T (T is reassignable)
  
  dimnames = list(
    c("Cox-Snell R2"),
    rep(c("Estimate", "Lower .95 ", "Upper .95"), 2)
  )
)

res_CS %>% 
  make_kable() %>% 
  add_header_above(c(" " = 1, 
                     "Apparent" = 3, 
                     "Internal" = 3))
Apparent
Internal
Estimate Lower .95 Upper .95 Estimate Lower .95 Upper .95
Cox-Snell R2 0.113 0.074 0.205 0.077 NA NA

Brier score

# Brier score:
Brier <- as.vector(M0$stats["Brier"])
# Bootstrap CI for Brier score:
g <- rms::bootcov(M0, stat = 'Brier', seed = 4, B = 1000)
# The CI limits (percentile bootstrap):
g.quan <- quantile(g$boot.stats, c(.025, .975))
# Lower CI bound:
Brier.LL <- g.quan[1]
# Upper CI bound:
Brier.UL <- g.quan[2]

# Apparent point estimate with its 95% CI:
Brier_app <- c(Brier, Brier.LL, Brier.UL)

# Adjusted for optimism via rms:
Brier_adj <- M0.boot["B", "index.corrected"]

res_Brier <- matrix(
  c(Brier_app,
    Brier_adj,
    NA,   # no bootstrap CI for the optimism-corrected estimate
    NA),
  
  nrow = 1,
  ncol = 6,
  byrow = TRUE,  # TRUE, not T (T is reassignable)
  
  dimnames = list(
    c("Brier"),
    rep(c("Estimate", "Lower .95 ", "Upper .95"), 2)
  )
)

res_Brier %>% 
  make_kable() %>% 
  add_header_above(c(" " = 1, 
                     "Apparent" = 3, 
                     "Internal" = 3))
Apparent
Internal
Estimate Lower .95 Upper .95 Estimate Lower .95 Upper .95
Brier 0.14 0.111 0.16 0.148 NA NA

Calibration intercept

Also called Calibration-in-the-large

# Apparent calibration intercept (calibration-in-the-large), as estimated
# by the CalibrationCurves package:
Ccurves_app <- calPerf$Calibration$Intercept

# Six cells for the summary row: apparent point estimate with its 95% CI,
# then the optimism-corrected estimate (no CI available for the latter).
cil_cells <- c(
  Ccurves_app["Point estimate"],
  Ccurves_app["Lower confidence limit"],
  Ccurves_app["Upper confidence limit"],
  summary(mod_iv)["Corrected", "Intercept"],
  NA,
  NA
)

res_cil <- matrix(
  cil_cells,
  nrow = 1,
  ncol = 6,
  byrow = TRUE,
  dimnames = list(
    c("Calibration-in-the-large"),
    rep(c("Estimate", "Lower .95 ", "Upper .95"), 2)
  )
)

# Render with grouped headers for apparent vs internally validated results:
res_cil %>%
  make_kable() %>%
  add_header_above(c(" " = 1,
                     "Apparent" = 3,
                     "Internal" = 3))
Apparent
Internal
Estimate Lower .95 Upper .95 Estimate Lower .95 Upper .95
Calibration-in-the-large 0 -0.275 0.275 -0.181 NA NA

Calibration slope

# Apparent calibration slope estimated by the CalibrationCurves package:
CC_slope_app <- calPerf$Calibration$Slope

# Six cells for the summary row: apparent point estimate with its 95% CI,
# then the optimism-corrected estimate (no CI available for the latter).
slope_cells <- c(
  CC_slope_app["Point estimate"],
  CC_slope_app["Lower confidence limit.2.5 %"],
  CC_slope_app["Upper confidence limit.97.5 %"],
  summary(mod_iv)["Corrected", "Slope"],
  NA,
  NA
)

# Table the results:
res_weak_cal <- matrix(
  slope_cells,
  nrow = 1,
  ncol = 6,
  byrow = TRUE,
  dimnames = list(
    c("Calibration slope"),
    rep(c("Estimate", "Lower .95 ", "Upper .95"), 2))
)

res_weak_cal %>%
  make_kable() %>%
  add_header_above(c(" " = 1, "Apparent" = 3, "Internal" = 3))
Apparent
Internal
Estimate Lower .95 Upper .95 Estimate Lower .95 Upper .95
Calibration slope 1 0.664 1.34 0.836 NA NA

Calibration plot

Logistic regression models are typically well-calibrated on the development data; and we see aspects of that here.

However, the model does have some issues.

# Using ggplot2:
# Flexible calibration plot (ggplot object) for the apparent predictions;
# stored so it can be combined with the LR II plot later in the document.
LR1CC <- CalibrationCurves::valProbggplot(pred, X.sel$wm)$ggPlot

LR1CC

Calibration plot for a logistic regression model for predicting the presence of white mold in snap bean.

Predicted probabilities

# Distribution of the fitted probabilities, shown separately for fields
# with and without white mold:
tibble::tibble(response = pred, truth = X.sel$wm) %>%
  ggplot(aes(x = response)) +
  geom_histogram(fill = "orange", col = "orange3", bins = 20) +
  facet_wrap(~ as.factor(truth), ncol = 1,
             labeller = as_labeller(c("0" = "No White Mold", "1" = "White Mold Present"))) +
  geom_rug(col = "blue", alpha = 0.5) +
  labs(x = "Probability estimate of white mold", y = "No. of fields") +
  theme_bw()

Histogram of fitted probabilities for a logistic regression model fit to a subset of predictors identified by the VSURF algorithm.

ICI

Apparent
# As a background check:
# Get the apparent estimate using CalibrationCurves, which generates a flexible calibration curve based on loess as the default.
# Eavg is used as the apparent ICI elsewhere in this document (see the
# internal-validation section, where Eavg maps to ICI.app):

calPerf$stats["Eavg"]

The calculations are done by hand, making heavy use of code by Daniele Giardiello. A CI for the estimate of the apparent ICI is calculated via bootstrap. Complicated enough as it is. No internal validation (correction for optimism) here; better (easier) obtained with pminternal.

# Calibration measures ICI, E50, E90 based on secondary logistic regression
# Append the apparent predicted probabilities to the analysis data:
Xmod <- X.sel %>%
  dplyr::mutate(pred = pred)

# Flexible calibration curve: loess smooth of the outcome on predicted risk.
fit_cal <- loess(wm ~ pred,
                 data = Xmod,
                 span = 0.75,
                 degree = 2,
                 family = "gaussian")

# Smoothed ("observed") event probabilities at each field:
cal_obs <- predict(fit_cal)

dt_cal <- data.frame(obs = cal_obs, pred = pred)

# Absolute calibration errors, summarized three ways (mean, median, 90th pctl):
abs_err <- abs(dt_cal$obs - dt_cal$pred)

res_calmeas <- c(
  "ICI" = mean(abs_err),
  "E50" = median(abs_err),
  "E90" = unname(quantile(abs_err, probs = .90))
)

## Bootstrap confidence intervals for the calibration measures (ICI, E50, E90)
alpha <- .05
# Set B = 2000 although it takes more time:
B <- 2000
set.seed(2022)
# Using stratification on wm because of the imbalance between the classes:
vboot <- rsample::bootstraps(X.sel, times = B, strata = wm)

# Bootstrap calibration measures
numsum_boot <- function(split) {
  # Compute apparent calibration measures (ICI, E50, E90) on one bootstrap
  # resample: predictions come from the fixed model M0, and a loess smooth
  # of outcome vs. predicted risk supplies the "observed" curve.
  
  # Predicted probabilities from M0 on the bootstrap analysis set:
  pred <- suppressWarnings(predict(M0,
                  type = "fitted.ind",
                  newdata = rsample::analysis(split)
                  ) )

  # Flexible calibration curve (the local `pred` is found via the
  # formula's environment):
  v_cal <- loess(wm ~ pred, 
                 data = rsample::analysis(split), 
                 span = 0.75, 
                 degree = 2, 
                 family = "gaussian")

  cal_obs_boot <- predict(v_cal)
  
  # Save objects needed
  db_cal_boot <- data.frame(
    "obs" = cal_obs_boot,
    "pred" = pred
    )
  
  # Absolute differences between smoothed observed and predicted risks:
  absdiff_boot <- abs(db_cal_boot$obs - db_cal_boot$pred)

  # Return the measures directly (the original assigned to res_cal_boot as
  # the last expression, which returned the value invisibly):
  data.frame(
    "ICI" = mean(absdiff_boot),
    "E50" = quantile(absdiff_boot, probs = .5),
    "E90" = quantile(absdiff_boot, probs = .9)
  )
}  # end of function

# Example of use:
# (bar <- purrr::pluck(vboot, "splits", 1) %>% numsum_boot())

tictoc::tic()
# Apply numsum_boot to every bootstrap resample, then unpack the three
# calibration measures into their own numeric columns:
numsum_b <- vboot %>%
  dplyr::mutate(
    num_cal_boot = purrr::map(splits, numsum_boot),
    ICI = purrr::map_dbl(num_cal_boot, ~ .x$ICI),
    E50 = purrr::map_dbl(num_cal_boot, ~ .x$E50),
    E90 = purrr::map_dbl(num_cal_boot, ~ .x$E90)
  )

tictoc::toc()  # 9 sec
9.69 sec elapsed
# numsum_b is a tibble with 2,000 rows
# Now we need to process the results to get the bootstrap CIs:
alpha <- .05
# One row: apparent estimate plus percentile-bootstrap 95% CI for each of
# ICI, E50 and E90:
res_numcal <- matrix(c(res_calmeas["ICI"],
                       quantile(numsum_b$ICI, probs = alpha / 2),
                       quantile(numsum_b$ICI, 
                                probs = 1 - alpha / 2),
                       
                       res_calmeas["E50"],
                       quantile(numsum_b$E50, probs = alpha / 2),
                       quantile(numsum_b$E50, probs = 1 - alpha / 2),
                       
                       res_calmeas["E90"],
                       quantile(numsum_b$E90, probs = alpha / 2),
                       quantile(numsum_b$E90, probs = 1 - alpha / 2)
                       ),
                     nrow = 1,
                     ncol = 9,
                     byrow = TRUE,  # TRUE, not T (T is reassignable)
                     dimnames = list(
                       c("Apparent"),
                       rep(
                         c("Estimate", "Lower.95", "Upper.95"),
                         3))
)


# Present the results:
res_numcal %>% 
  make_kable()  %>% 
  add_header_above(c(" " = 1,  
                     "ICI" = 3, 
                     "E50" = 3,
                     "E90" = 3))
ICI
E50
E90
Estimate Lower.95 Upper.95 Estimate Lower.95 Upper.95 Estimate Lower.95 Upper.95
Apparent 0.031 0.017 0.069 0.021 0.009 0.06 0.059 0.032 0.142
Internal validation
# ICI adjusted for optimism:

# oc = optimism-corrected
# Apparent and optimism-corrected calibration summaries from the
# pminternal validation object (Eavg corresponds to the ICI):
optsum <-
  c(
    ICI.app = summary(mod_iv)["Apparent", "Eavg"],
    ICI.oc = summary(mod_iv)["Corrected", "Eavg"],
    E50.app = summary(mod_iv)["Apparent", "E50"],
    E50.oc = summary(mod_iv)["Corrected", "E50"],
    E90.app = summary(mod_iv)["Apparent", "E90"],
    E90.oc = summary(mod_iv)["Corrected", "E90"]
    )

res_optcal <- matrix(c(optsum["ICI.app"],
                       optsum["ICI.oc"],
                       
                       optsum["E50.app"],
                       optsum["E50.oc"],
                       
                       optsum["E90.app"],
                       optsum["E90.oc"]
                       ),
                     nrow = 1,
                     ncol = 6,
                     byrow = TRUE,  # TRUE, not T (T is reassignable)
                     dimnames = list(
                       c("Value"),
                       rep(c("Apparent", "Corrected"), 3))
                     )

# Present the results:
res_optcal %>% 
  make_kable()  %>% 
  add_header_above(c(" " = 1,  
                     "ICI" = 2, 
                     "E50" = 2,
                     "E90" = 2))
ICI
E50
E90
Apparent Corrected Apparent Corrected Apparent Corrected
Value 0.027 0.033 0.014 0.021 0.057 0.073

C-statistic

Equivalent to the AUROC in the logistic regression model setting.

The CI is a bit wide, as would be expected for a small dataset. Correction for optimism still indicates a good model (at least by this measure).

# With rms::bootcov
# C-statistic (apparent)
# as.vector(M0$stats["C"])

# Bootstrap CI for the C-statistic:
g <- rms::bootcov(M0, stat = 'C', seed = 4, B = 1000)
# The CI limits (percentile bootstrap):
g.quan <- quantile(g$boot.stats, c(.025, .975))
# Lower CI bound:
# g.quan[1]
# Upper CI bound:
# g.quan[2]

# Optimism-corrected:
# The Dxy row of M0.boot is Somers' D, a transformed version of the
# C-statistic: C = (Dxy + 1) / 2. Use row/column names rather than the
# positional index [1, 5], so the code is robust to changes in the layout
# of the rms::validate() matrix:
C_adj <- 0.5*(M0.boot["Dxy", "index.corrected"] + 1)

res_C <- matrix(c(as.vector(M0$stats["C"]),
                  g.quan[1],
                  g.quan[2],
                  C_adj,
                  NA,   # no bootstrap CI for the optimism-corrected estimate
                  NA
                  ),
                nrow = 1,
                ncol = 6,
                byrow = TRUE,  # TRUE, not T (T is reassignable)
                dimnames = list(
                  c(""),
                  rep(c("Estimate", "Lower.95", "Upper.95"), 2))
                )

# Present the results:
res_C %>% 
  make_kable()  %>% 
  add_header_above(c(" " = 1,  
                     "Apparent" = 3, 
                     "Internal" = 3
                     )
                   )
Apparent
Internal
Estimate Lower.95 Upper.95 Estimate Lower.95 Upper.95
0.728 0.681 0.812 0.696 NA NA

Se, Sp, YI

# 1. Prepare predictions dataframe:
lr_pred <- data.frame(truth = X.sel$wm, response = pred)

# 2. Calculate apparent performance
lr_apparent_perf <- calc_metrics_cutpointr(lr_pred)

# 3. Bootstrap process
# Build wm ~ p1 + p2 + ... over the seven selected predictors:
fmla <- reformulate(
  termlabels = c("sm_40dap_to_49dap",
                 "t2m_mean_to_4dap",
                 "stsm_35dap_to_44dap",
                 "sm_5dap_to_15dap",
                 "log_silt_clay",
                 "rainto35dap",
                 "rain36to50dap"),
  response = "wm"
)


lr_boot_fxn <- function(i) {
  # One iteration of Harrell's bootstrap optimism-correction: fit on a
  # bootstrap resample, evaluate on both the resample (training
  # performance) and the original data (test performance), and return the
  # per-metric optimism (train - test) as a tidy tibble.
  
  # Print the current iteration
  cat("Currently processing iteration: ", i, "\n")
  
  # a. Generate bootstrap sample (stratified on wm: class imbalance)
  boot_sample <- rsample::bootstraps(X.sel, times = 1, strata = wm)$splits[[1]] %>%
    rsample::analysis()
  
  # b. Fit model on bootstrap sample with error handling
  # (x = TRUE, y = TRUE rather than T, which is reassignable)
  boot_model <- tryCatch(
    expr = {
       rms::lrm(fmla, data = boot_sample, x = TRUE, y = TRUE)
    },
    error = function(e) {
      cat("Error occurred in iteration", i, ":", e$message, "\n")
      return(NULL) # Return NULL if an error occurs
    }
  )
  
  # If boot_model is NULL, skip this iteration
  if (is.null(boot_model)) {
    cat("Skipping iteration", i, "due to error.\n")
    return(NULL)
  }
  
  
  # c. Calculate metrics on bootstrap sample (training performance)
  suppressWarnings(boot_pred <- predict(boot_model, type = "fitted.ind"))
  
  boot_pred_df <- data.frame(truth = boot_sample$wm, response = boot_pred)
  
  train_perf <- calc_metrics_cutpointr(boot_pred_df)
  
  # d & e. Apply bootstrap model to original dataset and calculate metrics (test performance)
  suppressWarnings(test_pred <- predict(boot_model, newdata = X.sel, type = "fitted.ind"))
  
  test_pred_df <- data.frame(truth = X.sel$wm, response = test_pred)
  
  test_perf <- calc_metrics_cutpointr(test_pred_df)
  
  # f. Calculate optimism
  optimism <- purrr::map2_dbl(train_perf, test_perf, \(x, y) x - y)
  
  # One row per metric for this iteration:
  tibble(
    iteration = i,
    metric = names(train_perf),
    train = unlist(train_perf),
    test = unlist(test_perf),
    optimism = optimism)
}


# Run lr_boot_fxn in parallel:
set.seed(178)
# One bootstrap iteration per seed; seq_len() is the idiomatic way to
# build 1..n (the redundant c() wrapper around 1:5000 is dropped):
my.seeds <- seq_len(5000)

# Nested plan: 5 multisession workers, each running sequentially:
plan(list(tweak(multisession, workers = 5), sequential))
tictoc::tic()
bootstrap_results <- 
  furrr::future_map(my.seeds, lr_boot_fxn, .options = furrr_options(seed = TRUE)) %>% 
  purrr::list_rbind()
tictoc::toc()  # 43 sec

# Back to sequential processing:
plan(sequential)

# 4. Apparent performance bootstrap confidence intervals
lr_apparent_CI <- bootstrap_results %>%
  dplyr::group_by(metric) %>%
  dplyr::summarize(app_LCI = quantile(train, .025),
                   app_UCI = quantile(train, .975))


# 5. Average optimism across all bootstrap samples
avg_optimism <- bootstrap_results %>%
  dplyr::group_by(metric) %>%
  dplyr::summarize(avg_optimism = mean(optimism))

# 6. Calculate optimism-corrected performance
lr_corrected_perf <- tibble(
  metric = names(lr_apparent_perf),
  apparent = unlist(lr_apparent_perf)
) %>%
  dplyr::left_join(lr_apparent_CI, by = "metric") %>%
  dplyr::left_join(avg_optimism, by = "metric") %>%
  dplyr::mutate(corrected = apparent - avg_optimism)


# Save the results:
save(lr_pred, bootstrap_results, lr_apparent_perf, lr_corrected_perf,
     file = here::here("Modeling", "LRIoptimism.RData"))
# lr_pred, bootstrap_results, lr_apparent_perf, lr_corrected_perf
load(here::here("Modeling", "LRIoptimism.RData"))

lr_corrected_perf %>%
  make_kable()
metric apparent app_LCI app_UCI avg_optimism corrected
1 YI 0.353 0.311 0.546 0.066 0.287
2 Se 0.562 0.438 0.877 0.063 0.499
3 Sp 0.792 0.512 0.947 0.003 0.789
4 AUC 0.728 0.678 0.812 0.034 0.694
5 brier 0.140 0.119 0.150 -0.007 0.147

LR model II

As the SHAP plots indicate nonlinearity in the feature contributions, we’ll model the features using restricted cubic splines. This will be done via the rms package.

Restrict to the top 4 predictors:

  • t2m_mean_to_4dap
  • sm_40dap_to_49dap
  • stsm_35dap_to_44dap
  • rain36to50dap

Conduct an internal validation, as all the data will be used for model building (as we don’t have much data or wm cases, it’s wasteful not to use all the data to build the model). Internal validation will correct for optimism in the model build.

# rf.tuned.sel, X.sel, selected_vars:
load(here::here("Modeling", "rf_tuned_sel.RData"))
# rm(list = ls()[!(ls() %in% c("X.sel", "make_kable"))])
# Makes life easier to have wm as a numeric (0,1) instead of as a factor:
X.sel <- X.sel %>%
  dplyr::mutate(wm = as.numeric(wm) - 1)

# Create a datadist object (used by rms for summaries and plots):
dd <- rms::datadist(X.sel)
options(datadist = "dd")

# Restricted cubic splines (3 knots) for each of the top four predictors:
fmla <- formula(paste("wm", "~",
                      paste(c("rcs(t2m_mean_to_4dap, 3)",
                              "rcs(sm_40dap_to_49dap, 3)",
                              "rcs(stsm_35dap_to_44dap, 3)",
                              "rcs(rain36to50dap, 3)"),
                            collapse = "+")))



# NOTE: getting this warning message, and have not been able to solve it. 
# From what I read, this is a software bug. Therefore, I am wrapping the code in suppressWarnings...
# Warning message:
# In formula.character(object, env = baseenv()) :
#   Using formula(x) is deprecated when x is a character vector of length > 1.
#   Consider formula(paste(x, collapse = " ")) instead.

# Set x = TRUE and y = TRUE to be able to compute bootstraps later for assessing optimism
# (TRUE rather than T, which is reassignable):
M0 <- rms::lrm(fmla, data = X.sel, x = TRUE, y = TRUE)

# Apparent predicted probabilities on the development data:
suppressWarnings(pred <- predict(M0, type = "fitted.ind"))

# No. obs:
n <- as.vector(M0$stats["Obs"])

# LR = Likelihood Ratio
LR <- as.vector(M0$stats["Model L.R."])

# Max CS_app: maximum attainable Cox-Snell R2, 1 - exp(-null deviance / n):
Max_CS_app <- as.vector(1 - exp(-M0$deviance[1]/n))

# Model summary:
print(M0, coef = TRUE)

# The no. of predictor parameters:
as.vector(M0$stats["d.f."])

## IF you have the original development data, you can adjust for optimism via bootstrapping:
set.seed(2022)
M0.boot <- rms::validate(M0, B = 1000)


# Set up for model performance statistics via the CalibrationCurves package
# (suppressGraphics stops val.prob.ci.2 from auto-plotting):
calPerf <- suppressGraphics({
  CalibrationCurves::val.prob.ci.2(pred, X.sel$wm)
})


# Get set up for using the `pminternal` package.  See the file Troubleshooting.R for more details. Basically, to avoid error messages and conflicts, we (i) do not load `pminternal`, (ii) write custom model fitting and predict functions.

# Refit the rcs logistic model with glm() on a supplied data set (the
# (data, ...) signature pminternal expects):
model_fun <- function(data, ...) {
  rcs_formula <- wm ~ rms::rcs(t2m_mean_to_4dap, 3) +
    rms::rcs(sm_40dap_to_49dap, 3) +
    rms::rcs(stsm_35dap_to_44dap, 3) +
    rms::rcs(rain36to50dap, 3)
  glm(rcs_formula, data = data, family = "binomial")
}

# Predicted probabilities from a fitted glm on new data:
pred_fun <- function(model, data, ...) {
  probs <- predict(model, newdata = data, type = "response")
  probs
}

# for calibration plot:
# Grid of 200 evenly spaced values spanning the range of the apparent
# predicted probabilities; passed to pminternal via calib_args below.
eval <- seq(min(pred), max(pred), length.out = 200)

set.seed(2022)
# Takes ~ 32 sec
# Bootstrap-optimism internal validation of the rcs logistic model, using
# the custom model_fun/pred_fun defined above (pminternal is deliberately
# not attached, to avoid namespace conflicts; see Troubleshooting.R).
mod_iv <- pminternal::validate(method = "boot_optimism", 
                               data = X.sel, 
                               outcome = "wm", 
                               model_fun = model_fun, 
                               pred_fun = pred_fun, 
                               B = 2000, 
                               cores = 4,
                               # Here we use loess:
                               calib_args = list(smooth = "loess", eval = eval))

Model summary

# Model summary:
print(M0, coef = TRUE)  # TRUE rather than T (T can be reassigned)
Logistic Regression Model

rms::lrm(formula = fmla, data = X.sel, x = T, y = T)

                       Model Likelihood      Discrimination    Rank Discrim.    
                             Ratio Test             Indexes          Indexes    
Obs           356    LR chi2      71.55      R2       0.286    C       0.798    
 0            283    d.f.             8      R2(8,356)0.163    Dxy     0.594    
 1             73    Pr(> chi2) <0.0001    R2(8,174.1)0.306    gamma   0.596    
max |deriv| 8e-05                            Brier    0.129    tau-a   0.195    

                     Coef    S.E.    Wald Z Pr(>|Z|)
Intercept            -3.9761  4.8076 -0.83  0.4082  
t2m_mean_to_4dap     -0.1647  0.1280 -1.29  0.1983  
t2m_mean_to_4dap'     0.5193  0.1316  3.95  <0.0001 
sm_40dap_to_49dap     6.9182 14.0293  0.49  0.6219  
sm_40dap_to_49dap'   -8.4035 10.7696 -0.78  0.4352  
stsm_35dap_to_44dap   0.0082  0.0327  0.25  0.8013  
stsm_35dap_to_44dap' -0.0453  0.0700 -0.65  0.5170  
rain36to50dap         0.0768  0.0206  3.73  0.0002  
rain36to50dap'       -0.1091  0.0275 -3.97  <0.0001 

Fitted model formula

Output a function for obtaining predictions on the logit scale from the fitted LR model.

Function(M0)
function (t2m_mean_to_4dap = 20.353918, sm_40dap_to_49dap = 0.30222176, 
    stsm_35dap_to_44dap = 65.915896, rain36to50dap = 46.65) 
{
    -3.9761487 - 0.16469932 * t2m_mean_to_4dap + 0.010732301 * 
        pmax(t2m_mean_to_4dap - 15.749826, 0)^3 - 0.031740014 * 
        pmax(t2m_mean_to_4dap - 20.353918, 0)^3 + 0.021007713 * 
        pmax(t2m_mean_to_4dap - 22.70603, 0)^3 + 6.9181724 * 
        sm_40dap_to_49dap - 204.29019 * pmax(sm_40dap_to_49dap - 
        0.18214171, 0)^3 + 500.78374 * pmax(sm_40dap_to_49dap - 
        0.30222176, 0)^3 - 296.49355 * pmax(sm_40dap_to_49dap - 
        0.3849594, 0)^3 + 0.0082354322 * stsm_35dap_to_44dap - 
        8.732913e-06 * pmax(stsm_35dap_to_44dap - 50.395775, 
            0)^3 + 1.1130272e-05 * pmax(stsm_35dap_to_44dap - 
        65.915896, 0)^3 - 2.3973588e-06 * pmax(stsm_35dap_to_44dap - 
        122.45139, 0)^3 + 0.076765708 * rain36to50dap - 1.6521394e-05 * 
        pmax(rain36to50dap - 17.05, 0)^3 + 2.5989608e-05 * pmax(rain36to50dap - 
        46.65, 0)^3 - 9.4682141e-06 * pmax(rain36to50dap - 98.3, 
        0)^3
}
<environment: 0x0000026f0568dea8>

Model fit statistics

We’ll focus on statistics that Riley and colleagues have suggested should be reported for logistic regression models.

No. predictor variables

length(M0$Design$name) %>% 
  make_kable(col.names = "No. predictors")
No. predictors
1 4

No. predictor parameters

as.vector(M0$stats["d.f."]) %>% 
  make_kable(col.names = "No. predictor params")
No. predictor params
1 8

Events per predictor param

(sum(M0$y)/as.vector(M0$stats["d.f."])) %>% 
  make_kable(col.names = "EPP")
EPP
1 9.12

Max Cox-Snell and LR

tibble::tibble(`Max Cox-Snell (app)` = Max_CS_app, `Likelihood ratio` = LR) %>% 
  make_kable()
Max Cox-Snell (app) Likelihood ratio
1 0.637 71.5

Nagelkerke

# Apparent Nagelkerke R2 from the fitted rms::lrm model:
Nagelkerke <- as.vector(M0$stats["R2"])

# Bootstrap CI for Nagelkerke R2:
g <- rms::bootcov(M0, stat = 'R2', seed = 4, B = 1000)
# The CI limits (percentile bootstrap):
g.quan <- quantile(g$boot.stats, c(.025, .975))
# Lower CI bound:
Nagelkerke.LL <- g.quan[1]
# Upper CI bound:
Nagelkerke.UL <- g.quan[2]

# Internal validation: optimism-adjusted Nagelkerke's R2 from the bootstrap
# validation results. Use row/column names rather than the positional index
# [2, 5], so the code is robust to changes in the layout of the
# rms::validate() matrix (and consistent with the Brier section's indexing):
Nagelkerke_adj <- M0.boot["R2", "index.corrected"]

res_Nagelkerke <- matrix(
  c(Nagelkerke,
    Nagelkerke.LL,
    Nagelkerke.UL,
    Nagelkerke_adj,
    NA,   # no bootstrap CI for the optimism-corrected estimate
    NA),
  
  nrow = 1,
  ncol = 6,
  byrow = TRUE,  # TRUE, not T (T is reassignable)
  
  dimnames = list(
    c("Nagelkerke R2"),
    rep(c("Estimate", "Lower .95 ", "Upper .95"), 2)
  )
)

res_Nagelkerke %>% 
  make_kable() %>% 
  add_header_above(c(" " = 1, 
                     "Apparent" = 3, 
                     "Internal" = 3))
Apparent
Internal
Estimate Lower .95 Upper .95 Estimate Lower .95 Upper .95
Nagelkerke R2 0.286 0.208 0.429 0.228 NA NA

Cox-Snell

# CS_app = Apparent estimate of the Cox-Snell statistic:
# Point estimate and 95% CI bounds, rescaled from Nagelkerke's R2 by the
# maximum attainable Cox-Snell value:
CS_app <- c(Nagelkerke, Nagelkerke.LL, Nagelkerke.UL)*Max_CS_app

# Adjusted for optimism:
CS_adj <- Nagelkerke_adj*Max_CS_app

res_CS <- matrix(
  c(CS_app,
    CS_adj,
    NA,   # no bootstrap CI for the optimism-corrected estimate
    NA),
  
  nrow = 1,
  ncol = 6,
  byrow = TRUE,  # TRUE, not T (T is reassignable)
  
  dimnames = list(
    c("Cox-Snell R2"),
    rep(c("Estimate", "Lower .95 ", "Upper .95"), 2)
  )
)

res_CS %>% 
  make_kable() %>% 
  add_header_above(c(" " = 1, 
                     "Apparent" = 3, 
                     "Internal" = 3))
Apparent
Internal
Estimate Lower .95 Upper .95 Estimate Lower .95 Upper .95
Cox-Snell R2 0.182 0.133 0.274 0.145 NA NA

Brier score

# Brier score:
Brier <- as.vector(M0$stats["Brier"])
# Bootstrap CI for Brier score:
g <- rms::bootcov(M0, stat = 'Brier', seed = 4, B = 1000)
# The CI limits (percentile bootstrap):
g.quan <- quantile(g$boot.stats, c(.025, .975))
# Lower CI bound:
Brier.LL <- g.quan[1]
# Upper CI bound:
Brier.UL <- g.quan[2]

# Apparent point estimate with its 95% CI:
Brier_app <- c(Brier, Brier.LL, Brier.UL)

# Adjusted for optimism via rms:
Brier_adj <- M0.boot["B", "index.corrected"]

res_Brier <- matrix(
  c(Brier_app,
    Brier_adj,
    NA,   # no bootstrap CI for the optimism-corrected estimate
    NA),
  
  nrow = 1,
  ncol = 6,
  byrow = TRUE,  # TRUE, not T (T is reassignable)
  
  dimnames = list(
    c("Brier"),
    rep(c("Estimate", "Lower .95 ", "Upper .95"), 2)
  )
)

res_Brier %>% 
  make_kable() %>% 
  add_header_above(c(" " = 1, 
                     "Apparent" = 3, 
                     "Internal" = 3))
Apparent
Internal
Estimate Lower .95 Upper .95 Estimate Lower .95 Upper .95
Brier 0.129 0.103 0.147 0.136 NA NA

Calibration intercept

Also called Calibration-in-the-large

# Apparent calibration intercept (calibration-in-the-large), as estimated
# by the CalibrationCurves package:
Ccurves_app <- calPerf$Calibration$Intercept

# Six cells for the summary row: apparent point estimate with its 95% CI,
# then the optimism-corrected estimate (no CI available for the latter).
cil_cells <- c(
  Ccurves_app["Point estimate"],
  Ccurves_app["Lower confidence limit"],
  Ccurves_app["Upper confidence limit"],
  summary(mod_iv)["Corrected", "Intercept"],
  NA,
  NA
)

res_cil <- matrix(
  cil_cells,
  nrow = 1,
  ncol = 6,
  byrow = TRUE,
  dimnames = list(
    c("Calibration-in-the-large"),
    rep(c("Estimate", "Lower .95 ", "Upper .95"), 2)
  )
)

# Render with grouped headers for apparent vs internally validated results:
res_cil %>%
  make_kable() %>%
  add_header_above(c(" " = 1,
                     "Apparent" = 3,
                     "Internal" = 3))
Apparent
Internal
Estimate Lower .95 Upper .95 Estimate Lower .95 Upper .95
Calibration-in-the-large 0 -0.291 0.291 -0.153 NA NA

Calibration slope

# Apparent calibration slope estimated by the CalibrationCurves package:
CC_slope_app <- calPerf$Calibration$Slope

# Six cells for the summary row: apparent point estimate with its 95% CI,
# then the optimism-corrected estimate (no CI available for the latter).
slope_cells <- c(
  CC_slope_app["Point estimate"],
  CC_slope_app["Lower confidence limit.2.5 %"],
  CC_slope_app["Upper confidence limit.97.5 %"],
  summary(mod_iv)["Corrected", "Slope"],
  NA,
  NA
)

# Table the results:
res_weak_cal <- matrix(
  slope_cells,
  nrow = 1,
  ncol = 6,
  byrow = TRUE,
  dimnames = list(
    c("Calibration slope"),
    rep(c("Estimate", "Lower .95 ", "Upper .95"), 2))
)

res_weak_cal %>%
  make_kable() %>%
  add_header_above(c(" " = 1, "Apparent" = 3, "Internal" = 3))
Apparent
Internal
Estimate Lower .95 Upper .95 Estimate Lower .95 Upper .95
Calibration slope 1 0.725 1.27 0.857 NA NA

Calibration plot

Logistic regression models are typically well-calibrated on the development data; and we see aspects of that here.

However, the model does have some issues. Very low and high risks are overestimated. Intermediate risks are underestimated.

LRII model
# Using ggplot2:
# Flexible calibration plot (ggplot object) for the LR II model's apparent
# predictions; stored for the side-by-side comparison with LR1CC.
LR2CC <- CalibrationCurves::valProbggplot(pred, X.sel$wm)$ggPlot

LR2CC

Calibration plot for a logistic regression model for predicting the presence of white mold in snap bean.
Comparison to the LRI model
# Stacked comparison of the two calibration plots (panel A = LR I,
# panel B = LR II), with a single shared legend:
(LR1CC + LR2CC) +
  plot_annotation(tag_levels = "A") +
  plot_layout(nrow = 2, byrow = TRUE, guides = "collect") &
  cowplot::theme_half_open(font_size = 12) &
  theme(legend.position = "bottom",
        legend.direction = "horizontal")

# Save at publication resolution, in both raster and vector formats:
ggsave("figs/LR_calibration_plot.png", dpi = 600, height = 7, width = 5, bg = "white")
ggsave("figs/LR_calibration_plot.pdf", dpi = 600, height = 7, width = 5, bg = "white")

Predicted probabilities

# Distribution of the fitted probabilities, shown separately for fields
# with and without white mold:
tibble::tibble(response = pred, truth = X.sel$wm) %>%
  ggplot(aes(x = response)) +
  geom_histogram(fill = "orange", col = "orange3", bins = 20) +
  facet_wrap(~ as.factor(truth), ncol = 1,
             labeller = as_labeller(c("0" = "No White Mold", "1" = "White Mold Present"))) +
  geom_rug(col = "blue", alpha = 0.5) +
  labs(x = "Probability estimate of white mold", y = "No. of fields") +
  theme_bw()

Histogram of fitted probabilities for a logistic regression model fit to a subset of predictors identified by the VSURF algorithm.

ICI

Apparent
# As a background check:
# Get the apparent estimate using CalibrationCurves, which generates a flexible calibration curve based on loess as the default.

calPerf$stats["Eavg"]

The calculations are done by hand, making heavy use of code by Daniele Giardiello. A CI for the estimate of the apparent ICI is calculated via bootstrap. Complicated enough as it is. No internal validation (correction for optimism) here; better (easier) obtained with pminternal.

# Calibration measures ICI, E50, E90 based on secondary logistic regression
# Append the apparent predicted probabilities to the analysis data:
Xmod <- X.sel %>%
  dplyr::mutate(pred = pred)

# Flexible calibration curve: loess smooth of the outcome on predicted risk.
fit_cal <- loess(wm ~ pred,
                 data = Xmod,
                 span = 0.75,
                 degree = 2,
                 family = "gaussian")

# Smoothed ("observed") event probabilities at each field:
cal_obs <- predict(fit_cal)

dt_cal <- data.frame(obs = cal_obs, pred = pred)

# Absolute calibration errors, summarized three ways (mean, median, 90th pctl):
abs_err <- abs(dt_cal$obs - dt_cal$pred)

res_calmeas <- c(
  "ICI" = mean(abs_err),
  "E50" = median(abs_err),
  "E90" = unname(quantile(abs_err, probs = .90))
)

## Bootstrap confidence intervals for the calibration measures (ICI, E50, E90)
alpha <- .05
# Set B = 2000 although it takes more time:
B <- 2000
set.seed(2022)
# Using stratification on wm because of the imbalance between the classes:
vboot <- rsample::bootstraps(X.sel, times = B, strata = wm)

# Bootstrap calibration measures
numsum_boot <- function(split) {
  # Compute apparent calibration measures (ICI, E50, E90) on one bootstrap
  # resample: predictions come from the fixed model M0, and a loess smooth
  # of outcome vs. predicted risk supplies the "observed" curve.
  
  # Predicted probabilities from M0 on the bootstrap analysis set:
  pred <- suppressWarnings(predict(M0,
                  type = "fitted.ind",
                  newdata = rsample::analysis(split)
                  ) )

  # Flexible calibration curve (the local `pred` is found via the
  # formula's environment):
  v_cal <- loess(wm ~ pred, 
                 data = rsample::analysis(split), 
                 span = 0.75, 
                 degree = 2, 
                 family = "gaussian")

  cal_obs_boot <- predict(v_cal)
  
  # Save objects needed
  db_cal_boot <- data.frame(
    "obs" = cal_obs_boot,
    "pred" = pred
    )
  
  # Absolute differences between smoothed observed and predicted risks:
  absdiff_boot <- abs(db_cal_boot$obs - db_cal_boot$pred)

  # Return the measures directly (the original assigned to res_cal_boot as
  # the last expression, which returned the value invisibly):
  data.frame(
    "ICI" = mean(absdiff_boot),
    "E50" = quantile(absdiff_boot, probs = .5),
    "E90" = quantile(absdiff_boot, probs = .9)
  )
}  # end of function

# Example of use:
# (bar <- purrr::pluck(vboot, "splits", 1) %>% numsum_boot())


# Apply numsum_boot to every bootstrap resample, then unpack the three
# calibration measures into their own numeric columns:
numsum_b <- vboot %>%
  dplyr::mutate(
    num_cal_boot = purrr::map(splits, numsum_boot),
    ICI = purrr::map_dbl(num_cal_boot, ~ .x$ICI),
    E50 = purrr::map_dbl(num_cal_boot, ~ .x$E50),
    E90 = purrr::map_dbl(num_cal_boot, ~ .x$E90)
  )

# numsum_b is a tibble with B (= 2,000) rows.
# Now we need to process the results to get the bootstrap CIs:
alpha <- .05

# Apparent estimate plus the two-sided (1 - alpha) percentile bootstrap
# bounds for one calibration measure:
est_with_ci <- function(metric) {
  c(res_calmeas[metric],
    quantile(numsum_b[[metric]], probs = alpha / 2),
    quantile(numsum_b[[metric]], probs = 1 - alpha / 2))
}

# One row, three (Estimate, Lower.95, Upper.95) triplets:
res_numcal <- matrix(c(est_with_ci("ICI"),
                       est_with_ci("E50"),
                       est_with_ci("E90")),
                     nrow = 1,
                     ncol = 9,
                     byrow = TRUE,  # was T; TRUE/FALSE cannot be masked
                     dimnames = list(
                       c("Apparent"),
                       rep(
                         c("Estimate", "Lower.95", "Upper.95"),
                         3))
)


# Present the results as an html table, grouping the CI columns under
# their calibration-measure names:
add_header_above(
  make_kable(res_numcal),
  c(" " = 1, "ICI" = 3, "E50" = 3, "E90" = 3)
)
ICI
E50
E90
Estimate Lower.95 Upper.95 Estimate Lower.95 Upper.95 Estimate Lower.95 Upper.95
Apparent 0.018 0.016 0.056 0.017 0.009 0.051 0.034 0.03 0.132
Internal validation
# ICI adjusted for optimism:

# oc = optimism-corrected. "Eavg" in the validation summary is the ICI.
# Call summary(mod_iv) once instead of six times:
iv_summary <- summary(mod_iv)

optsum <-
  c(
    ICI.app = iv_summary["Apparent", "Eavg"],
    ICI.oc = iv_summary["Corrected", "Eavg"],
    E50.app = iv_summary["Apparent", "E50"],
    E50.oc = iv_summary["Corrected", "E50"],
    E90.app = iv_summary["Apparent", "E90"],
    E90.oc = iv_summary["Corrected", "E90"]
    )

# Arrange the apparent and optimism-corrected values into a one-row
# matrix, one (Apparent, Corrected) pair per calibration measure.
# A single named subset replaces the six individual extractions:
res_optcal <- matrix(optsum[c("ICI.app", "ICI.oc",
                              "E50.app", "E50.oc",
                              "E90.app", "E90.oc")],
                     nrow = 1,
                     ncol = 6,
                     byrow = TRUE,  # was T; TRUE/FALSE cannot be masked
                     dimnames = list(
                       c("Value"),
                       rep(c("Apparent", "Corrected"), 3))
                     )

# Present the results: grouped html table, one (Apparent, Corrected)
# column pair per calibration measure:
add_header_above(
  make_kable(res_optcal),
  c(" " = 1, "ICI" = 2, "E50" = 2, "E90" = 2)
)
ICI
E50
E90
Apparent Corrected Apparent Corrected Apparent Corrected
Value 0.017 0.022 0.015 0.016 0.036 0.05

C-statistic

Equivalent to the AUROC in the logistic regression model setting.

The CI is a bit wide, as would be expected for a small dataset. Correction for optimism still indicates a good model (at least by this measure).

# With rms::bootcov
# C-statistic (apparent)
# as.vector(M0$stats["C"])

# Bootstrap CI for the C-statistic (1000 resamples of the model fit):
g <- rms::bootcov(M0, stat = 'C', seed = 4, B = 1000)
# The percentile CI limits:
g.quan <- quantile(g$boot.stats, c(.025, .975))
# Lower CI bound:
# g.quan[1]
# Upper CI bound:
# g.quan[2]

# Optimism-corrected:
# The 1st row in M0.boot is Dxy, which is Somers' D. It is a transformed
# version of the C-statistic: C = Dxy/2 + 0.5.
# NOTE(review): column 5 is assumed to be the optimism-corrected
# ("index.corrected") column of the validation output — confirm.
C_adj <- 0.5*(M0.boot[1, 5] + 1)

# One-row summary: apparent C with its bootstrap percentile CI, then the
# optimism-corrected C (no CI was computed for it, hence the NAs):
res_C <- matrix(c(as.vector(M0$stats["C"]),
                  g.quan[1],
                  g.quan[2],
                  C_adj,
                  NA,
                  NA
                  ),
                nrow = 1,
                ncol = 6,
                byrow = TRUE,  # was T; TRUE/FALSE cannot be masked
                dimnames = list(
                  c(""),
                  rep(c("Estimate", "Lower.95", "Upper.95"), 2))
                )

# Present the results: apparent vs internally validated C-statistic,
# each with Estimate / Lower.95 / Upper.95 columns:
add_header_above(
  make_kable(res_C),
  c(" " = 1, "Apparent" = 3, "Internal" = 3)
)
Apparent
Internal
Estimate Lower.95 Upper.95 Estimate Lower.95 Upper.95
0.798 0.755 0.869 0.773 NA NA

Se, Sp, YI

# 1. Prepare predictions dataframe:
# truth = observed binary outcome; response = fitted probabilities
# (`pred` was computed earlier in this file from model M0):
lr_pred <-
  data.frame(truth = X.sel$wm, response = pred)

# 2. Calculate apparent performance
# calc_metrics_cutpointr is defined earlier in this file; per the results
# table below it yields YI, Se, Sp, AUC and the Brier score:
lr_apparent_perf <- calc_metrics_cutpointr(lr_pred)

# 3. Bootstrap process
fmla <- formula(paste("wm", "~", 
                      paste(c("rms::rcs(t2m_mean_to_4dap, 3)",
                              "rms::rcs(sm_40dap_to_49dap, 3)",
                              "rms::rcs(stsm_35dap_to_44dap, 3)", 
                              "rms::rcs(rain36to50dap, 3)"),
                            collapse = "+")))


# One iteration of the optimism bootstrap for the logistic model:
# fit on a bootstrap resample, score it on both the resample (training
# performance) and the original data (test performance), and report the
# per-metric optimism (train - test).
#
# Args:
#   i: iteration index. Used only for progress/error messages; the RNG
#      streams are managed by furrr via furrr_options(seed = TRUE).
#
# Returns:
#   a tibble with columns iteration, metric, train, test, optimism,
#   or NULL if the model fit failed on this resample.
lr_boot_fxn <- function(i) {
  # Print the current iteration
  cat("Currently processing iteration: ", i, "\n")
  
  # a. Generate one stratified bootstrap sample (strata = wm because of
  # the class imbalance):
  boot_sample <- rsample::bootstraps(X.sel, times = 1, strata = wm)$splits[[1]] %>%
    rsample::analysis()
  
  # b. Fit model on the bootstrap sample. Resamples can fail to fit
  # (e.g. separation/singularity), so trap errors and skip the iteration:
  boot_model <- tryCatch(
    expr = {
      rms::lrm(fmla, data = boot_sample, x = TRUE, y = TRUE)  # was x = T, y = T
    },
    error = function(e) {
      cat("Error occurred in iteration", i, ":", e$message, "\n")
      NULL  # signal failure to the caller
    }
  )
  
  # If boot_model is NULL, skip this iteration
  if (is.null(boot_model)) {
    cat("Skipping iteration", i, "due to error.\n")
    return(NULL)
  }
  
  # c. Training performance: metrics on the bootstrap sample itself.
  # (Assignment moved outside suppressWarnings instead of being hidden
  # inside the call.)
  boot_pred <- suppressWarnings(predict(boot_model, type = "fitted.ind"))
  
  boot_pred_df <- data.frame(truth = boot_sample$wm, response = boot_pred)
  
  train_perf <- calc_metrics_cutpointr(boot_pred_df)
  
  # d & e. Test performance: the bootstrap model applied to the original
  # dataset:
  test_pred <- suppressWarnings(predict(boot_model, newdata = X.sel, type = "fitted.ind"))
  
  test_pred_df <- data.frame(truth = X.sel$wm, response = test_pred)
  
  test_perf <- calc_metrics_cutpointr(test_pred_df)
  
  # f. Optimism = training performance - test performance, per metric:
  optimism <- purrr::map2_dbl(train_perf, test_perf, \(x, y) x - y)
  
  tibble(
    iteration = i,
    metric = names(train_perf),
    train = unlist(train_perf),
    test = unlist(test_perf),
    optimism = optimism)
}


# Run lr_boot_fxn in parallel:
set.seed(5987)
# One element per bootstrap iteration (5000 iterations in total):
my.seeds <- c(1:5000)

# Nested plan: 5 multisession workers at the outer level, sequential
# evaluation inside each worker:
plan(list(tweak(multisession, workers = 5), sequential))
tictoc::tic()
# Failed iterations return NULL, which list_rbind() drops silently:
bootstrap_results <- 
  furrr::future_map(my.seeds, lr_boot_fxn, .options = furrr_options(seed = TRUE)) %>% 
  purrr::list_rbind()
tictoc::toc()  # 50 sec

# Return to sequential processing:
plan(sequential)


# 4. Apparent performance bootstrap confidence intervals
# 95% percentile interval of the training-sample metrics, per metric:
lr_apparent_CI <-
  bootstrap_results %>%
  dplyr::group_by(metric) %>%
  dplyr::summarise(
    app_LCI = quantile(train, probs = 0.025),
    app_UCI = quantile(train, probs = 0.975)
  )


# 5. Average optimism across all bootstrap samples (per metric):
avg_optimism <- 
  bootstrap_results %>%
  dplyr::group_by(metric) %>%
  dplyr::summarise(avg_optimism = mean(optimism)) %>%
  dplyr::ungroup()

# 6. Calculate optimism-corrected performance
# Start from the apparent metrics, attach their bootstrap CIs and the
# average optimism, then subtract the optimism (nested-call form):
lr_corrected_perf <-
  dplyr::mutate(
    dplyr::left_join(
      dplyr::left_join(
        tibble(
          metric = names(lr_apparent_perf),
          apparent = unlist(lr_apparent_perf)
        ),
        lr_apparent_CI,
        by = "metric"
      ),
      avg_optimism,
      by = "metric"
    ),
    corrected = apparent - avg_optimism
  )


# Save the results so the expensive bootstrap can be skipped on re-render:
save(lr_pred, lr_apparent_perf, bootstrap_results, lr_corrected_perf, file = here::here("Modeling", "LRIIoptimism.RData"))
# lr_pred, lr_apparent_perf, bootstrap_results, lr_corrected_perf
# Reload the cached objects:
load(here::here("Modeling", "LRIIoptimism.RData"))  

# Present the optimism-corrected performance table:
lr_corrected_perf %>% 
  make_kable()
metric apparent app_LCI app_UCI avg_optimism corrected
1 YI 0.461 0.410 0.637 0.060 0.401
2 Se 0.836 0.575 0.932 0.049 0.786
3 Sp 0.625 0.565 0.908 0.010 0.615
4 AUC 0.798 0.754 0.867 0.027 0.771
5 brier 0.129 0.108 0.140 -0.007 0.136

Session Info

sessionInfo()
R version 4.4.1 (2024-06-14 ucrt)
Platform: x86_64-w64-mingw32/x64
Running under: Windows 11 x64 (build 26100)

Matrix products: default


locale:
[1] LC_COLLATE=Portuguese_Brazil.utf8  LC_CTYPE=Portuguese_Brazil.utf8   
[3] LC_MONETARY=Portuguese_Brazil.utf8 LC_NUMERIC=C                      
[5] LC_TIME=Portuguese_Brazil.utf8    

time zone: America/Sao_Paulo
tzcode source: internal

attached base packages:
 [1] splines   grid      tools     compiler  parallel  stats     graphics 
 [8] grDevices datasets  utils     methods   base     

other attached packages:
  [1] cards_0.4.0             multcomp_1.4-26         fastmatch_1.1-6        
  [4] cowplot_1.1.3           rlang_1.1.4             insight_1.2.0          
  [7] scales_1.3.0            mvtnorm_1.3-2           viridisLite_0.4.2      
 [10] listenv_0.9.1           MatrixModels_0.5-3      globals_0.16.3         
 [13] Rcpp_1.0.13             munsell_0.5.1           systemfonts_1.1.0      
 [16] rpart_4.1.23            cli_3.6.3               codetools_0.2-20       
 [19] evaluate_1.0.1          yaml_2.3.10             stringi_1.8.4          
 [22] RConics_1.1.1           xfun_0.48               svglite_2.1.3          
 [25] bookdown_0.41           gridExtra_2.3           tidyselect_1.2.1       
 [28] renv_1.1.2              BBmisc_1.13             pillar_1.9.0           
 [31] markdown_1.13           utf8_1.2.4              xml2_1.3.6             
 [34] hms_1.1.3               data.table_1.16.2       tzdb_0.4.0             
 [37] gtable_0.3.5            generics_0.1.3          cluster_2.1.6          
 [40] nlme_3.1-164            Metrics_0.1.4           glue_1.8.0             
 [43] nnet_7.3-19             future.apply_1.11.2     parallelMap_1.5.1      
 [46] foreign_0.8-86          quantreg_5.98           SparseM_1.84-2         
 [49] R.utils_2.12.3          R.oo_1.27.0             R.methodsS3_1.8.2      
 [52] highr_0.11              backports_1.5.0         htmlTable_2.4.3        
 [55] doParallel_1.0.17       foreach_1.5.2           withr_3.0.2            
 [58] here_1.0.1              timechange_0.3.0        fansi_1.0.6            
 [61] yardstick_1.3.1         randomForest_4.7-1.2    labeling_0.4.3         
 [64] textshaping_0.4.0       rprojroot_2.0.4         colorspace_2.1-1       
 [67] digest_0.6.37           fastmap_1.2.0           R6_2.5.1               
 [70] Matrix_1.7-0            pkgconfig_2.0.3         iterators_1.0.14       
 [73] lifecycle_1.0.4         commonmark_1.9.2        gt_0.11.1              
 [76] zoo_1.8-12              sandwich_3.1-1          plyr_1.8.9             
 [79] DiceKriging_1.6.0       htmlwidgets_1.6.4       parallelly_1.38.0      
 [82] sass_0.4.9              Formula_1.2-5           xgboost_1.7.8.1        
 [85] haven_2.5.4             polspline_1.1.25        htmltools_0.5.8.1      
 [88] base64enc_0.1-3         vctrs_0.6.5             ragg_1.3.3             
 [91] rmarkdown_2.28          farver_2.1.2            TH.data_1.1-2          
 [94] MASS_7.3-60.2           survival_3.6-4          magrittr_2.0.3         
 [97] jsonlite_1.8.9          rstudioapi_0.17.0       RColorBrewer_1.1-3     
[100] R.devices_2.17.2        CalibrationCurves_2.0.3 rms_6.9-0              
[103] Hmisc_5.2-1             tictoc_1.2.1            shapviz_0.9.6          
[106] kernelshap_0.7.0        lattice_0.22-6          patchwork_1.3.0        
[109] predtools_0.0.3         VSURF_1.2.0             pdp_0.8.2              
[112] vip_0.4.1               pROC_1.18.5             bayestestR_0.15.3      
[115] furrr_0.3.1             future_1.34.0           cutpointr_1.2.0        
[118] tuneRanger_0.7          lhs_1.2.0               mlrMBO_1.1.5.1         
[121] smoof_1.6.0.3           checkmate_2.3.2         mlr_2.19.2             
[124] ParamHelpers_1.14.1     ranger_0.17.0           kableExtra_1.4.0       
[127] gtsummary_2.0.4         labelled_2.13.0         rsample_1.2.1          
[130] lubridate_1.9.3         forcats_1.0.0           stringr_1.5.1          
[133] dplyr_1.1.4             purrr_1.0.2             readr_2.1.5            
[136] tidyr_1.3.1             tibble_3.2.1            ggplot2_3.5.1          
[139] tidyverse_2.0.0         iml_0.11.3              knitr_1.48             

loaded via a namespace (and not attached):
[1] pmcalibration_0.1.0 pminternal_0.0.1    mgcv_1.9-1         
[4] chk_0.9.2           pbapply_1.7-2